[llvm] [WIP][X86] combineX86ShufflesRecursively - attempt to combine shuffles with larger types from EXTRACT_SUBVECTOR nodes (PR #133947)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 2 02:10:11 PDT 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/133947

>From b54089c363d54bbb754cb9de5b184d483953e5d2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 1 Apr 2025 18:30:11 +0100
Subject: [PATCH] [WIP][X86] combineX86ShufflesRecursively - attempt to combine
 shuffles with larger types from EXTRACT_SUBVECTOR nodes

This replaces the rather limited combineX86ShuffleChainWithExtract function with handling for EXTRACT_SUBVECTOR nodes as we recurse down the shuffle chain, widening the shuffle mask to accommodate the larger value type.

This will mainly help AVX2/AVX512 cases with cross-lane shuffles, but it also helps collapse some cases where the same subvector has gotten reused in multiple lanes.

Exposed missing DemandedElts handling inside ISD::TRUNCATE nodes for ComputeNumSignBits
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |     3 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   230 +-
 .../any_extend_vector_inreg_of_broadcast.ll   |   432 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll |    60 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll           |     8 +-
 llvm/test/CodeGen/X86/avx512-hadd-hsub.ll     |     7 +-
 .../X86/avx512-shuffles/partial_permute.ll    |   995 +-
 llvm/test/CodeGen/X86/bfloat.ll               |    53 +-
 llvm/test/CodeGen/X86/combine-pmuldq.ll       |    26 +-
 llvm/test/CodeGen/X86/combine-sra.ll          |    60 +-
 .../copy-low-subvec-elt-to-high-subvec-elt.ll |    38 +-
 llvm/test/CodeGen/X86/haddsub-undef.ll        |    93 +-
 llvm/test/CodeGen/X86/haddsub.ll              |    66 +-
 .../test/CodeGen/X86/known-signbits-vector.ll |    66 +-
 llvm/test/CodeGen/X86/madd.ll                 |     6 +-
 llvm/test/CodeGen/X86/masked_compressstore.ll |   453 +-
 .../test/CodeGen/X86/masked_gather_scatter.ll |    38 +-
 llvm/test/CodeGen/X86/masked_store.ll         |    44 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      |    54 +-
 .../CodeGen/X86/min-legal-vector-width.ll     |    10 +-
 llvm/test/CodeGen/X86/oddshuffles.ll          |     8 +-
 llvm/test/CodeGen/X86/pmul.ll                 |    35 +-
 llvm/test/CodeGen/X86/pr132844.ll             |    10 +-
 llvm/test/CodeGen/X86/pr29112.ll              |     4 +-
 llvm/test/CodeGen/X86/pr62286.ll              |     5 +-
 llvm/test/CodeGen/X86/pr95278.ll              |     4 +-
 llvm/test/CodeGen/X86/pr97968.ll              |     4 +-
 llvm/test/CodeGen/X86/shift-i512.ll           |    18 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll |   384 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll |    24 +-
 llvm/test/CodeGen/X86/trunc-subvector.ll      |    19 +-
 .../CodeGen/X86/vec-strict-fptoint-256.ll     |    70 +-
 .../CodeGen/X86/vec-strict-fptoint-512.ll     |    33 +-
 .../CodeGen/X86/vec-strict-inttofp-256.ll     |   844 +-
 .../CodeGen/X86/vec-strict-inttofp-512.ll     |   152 +-
 llvm/test/CodeGen/X86/vec_fp_to_int.ll        |    52 +-
 llvm/test/CodeGen/X86/vector-compress.ll      |    39 +-
 .../CodeGen/X86/vector-half-conversions.ll    |   624 +-
 .../vector-interleaved-load-i16-stride-3.ll   |  3766 ++--
 .../vector-interleaved-load-i16-stride-4.ll   |  2028 +--
 .../vector-interleaved-load-i16-stride-5.ll   |  9492 +++++-----
 .../vector-interleaved-load-i16-stride-6.ll   |  8731 +++++----
 .../vector-interleaved-load-i16-stride-7.ll   |  7156 ++++----
 .../vector-interleaved-load-i16-stride-8.ll   |  6688 +++----
 .../vector-interleaved-load-i32-stride-3.ll   |   566 +-
 .../vector-interleaved-load-i32-stride-5.ll   |   390 +-
 .../vector-interleaved-load-i32-stride-6.ll   |  2544 +--
 .../vector-interleaved-load-i32-stride-7.ll   |  3506 ++--
 .../vector-interleaved-load-i64-stride-3.ll   |   128 +-
 .../vector-interleaved-load-i64-stride-4.ll   |    88 +-
 .../vector-interleaved-load-i64-stride-5.ll   |   368 +-
 .../vector-interleaved-load-i64-stride-6.ll   |   132 +-
 .../vector-interleaved-load-i64-stride-7.ll   | 14817 ++++++++--------
 .../vector-interleaved-load-i64-stride-8.ll   |   220 +-
 .../vector-interleaved-load-i8-stride-3.ll    |    71 +-
 .../vector-interleaved-load-i8-stride-5.ll    |   296 +-
 .../vector-interleaved-load-i8-stride-7.ll    |  8000 ++++-----
 .../vector-interleaved-load-i8-stride-8.ll    | 10748 +++++------
 .../vector-interleaved-store-i16-stride-3.ll  |  1492 +-
 .../vector-interleaved-store-i16-stride-5.ll  |   188 +-
 .../vector-interleaved-store-i16-stride-6.ll  |  2250 +--
 .../vector-interleaved-store-i16-stride-7.ll  |  8552 +++++----
 .../vector-interleaved-store-i32-stride-6.ll  |  2094 +--
 .../vector-interleaved-store-i32-stride-7.ll  |   946 +-
 .../vector-interleaved-store-i32-stride-8.ll  |     2 +-
 .../vector-interleaved-store-i64-stride-5.ll  |   631 +-
 .../vector-interleaved-store-i8-stride-5.ll   |   808 +-
 .../vector-interleaved-store-i8-stride-6.ll   |   486 +-
 .../vector-interleaved-store-i8-stride-7.ll   |  5512 +++---
 .../vector-interleaved-store-i8-stride-8.ll   |   645 +-
 llvm/test/CodeGen/X86/vector-llrint.ll        |    16 +-
 llvm/test/CodeGen/X86/vector-lrint.ll         |    16 +-
 llvm/test/CodeGen/X86/vector-reduce-fadd.ll   |   558 +-
 llvm/test/CodeGen/X86/vector-reduce-fmax.ll   |   101 +-
 llvm/test/CodeGen/X86/vector-reduce-fmin.ll   |   101 +-
 llvm/test/CodeGen/X86/vector-reduce-fmul.ll   |   222 +-
 .../CodeGen/X86/vector-shuffle-128-unpck.ll   |    60 +-
 .../CodeGen/X86/vector-shuffle-256-v16.ll     |   105 +-
 .../CodeGen/X86/vector-shuffle-256-v32.ll     |    52 +-
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll |    56 +-
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll |   353 +-
 .../CodeGen/X86/vector-shuffle-512-v16.ll     |    41 +-
 .../CodeGen/X86/vector-shuffle-512-v64.ll     |   124 +-
 .../test/CodeGen/X86/vector-shuffle-512-v8.ll |     3 +-
 .../test/CodeGen/X86/vector-shuffle-avx512.ll |    93 +-
 .../X86/vector-shuffle-combining-avx.ll       |    76 +-
 .../CodeGen/X86/vector-shuffle-combining.ll   |    25 +-
 llvm/test/CodeGen/X86/vector-shuffle-v192.ll  |    62 +-
 llvm/test/CodeGen/X86/vector-trunc.ll         |    56 +-
 llvm/test/CodeGen/X86/vselect-packss.ll       |     6 +-
 llvm/test/CodeGen/X86/widen_fadd.ll           |    31 +-
 llvm/test/CodeGen/X86/widen_fdiv.ll           |    48 +-
 llvm/test/CodeGen/X86/widen_fmul.ll           |    31 +-
 llvm/test/CodeGen/X86/widen_fsub.ll           |    31 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |   196 +-
 .../CodeGen/X86/zero_extend_vector_inreg.ll   |    70 +-
 .../zero_extend_vector_inreg_of_broadcast.ll  |   911 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll |   191 +-
 98 files changed, 56574 insertions(+), 55473 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3526beeb312ce..2da00a42f0f9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5067,7 +5067,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
   case ISD::TRUNCATE: {
     // Check if the sign bits of source go down as far as the truncated value.
     unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
-    unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    unsigned NumSrcSignBits =
+        ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
     if (NumSrcSignBits > (NumSrcBits - VTBits))
       return NumSrcSignBits - (NumSrcBits - VTBits);
     break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e6a891444bf1..6b7d82bdcedf1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39653,13 +39653,6 @@ static bool matchBinaryPermuteShuffle(
   return false;
 }
 
-static SDValue combineX86ShuffleChainWithExtract(
-    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
-    ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
-    bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
-    bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
-    const X86Subtarget &Subtarget);
-
 /// Combine an arbitrary chain of shuffles into a single instruction if
 /// possible.
 ///
@@ -40203,14 +40196,6 @@ static SDValue combineX86ShuffleChain(
       return DAG.getBitcast(RootVT, Res);
     }
 
-    // If that failed and either input is extracted then try to combine as a
-    // shuffle with the larger type.
-    if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-            Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
-            AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
-            IsMaskedShuffle, DAG, DL, Subtarget))
-      return WideShuffle;
-
     // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
     // (non-VLX will pad to 512-bit shuffles).
     if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40376,14 +40361,6 @@ static SDValue combineX86ShuffleChain(
     return DAG.getBitcast(RootVT, Res);
   }
 
-  // If that failed and either input is extracted then try to combine as a
-  // shuffle with the larger type.
-  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-          Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
-          AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
-          DAG, DL, Subtarget))
-    return WideShuffle;
-
   // If we have a dual input shuffle then lower to VPERMV3,
   // (non-VLX will pad to 512-bit shuffles)
   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40409,154 +40386,6 @@ static SDValue combineX86ShuffleChain(
   return SDValue();
 }
 
-// Combine an arbitrary chain of shuffles + extract_subvectors into a single
-// instruction if possible.
-//
-// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
-// type size to attempt to combine:
-// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
-// -->
-// extract_subvector(shuffle(x,y,m2),0)
-static SDValue combineX86ShuffleChainWithExtract(
-    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
-    ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
-    bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
-    bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
-    const X86Subtarget &Subtarget) {
-  unsigned NumMaskElts = BaseMask.size();
-  unsigned NumInputs = Inputs.size();
-  if (NumInputs == 0)
-    return SDValue();
-
-  unsigned RootSizeInBits = RootVT.getSizeInBits();
-  unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
-  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
-
-  // Peek through subvectors to find widest legal vector.
-  // TODO: Handle ISD::TRUNCATE
-  unsigned WideSizeInBits = RootSizeInBits;
-  for (SDValue Input : Inputs) {
-    Input = peekThroughBitcasts(Input);
-    while (1) {
-      if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-        Input = peekThroughBitcasts(Input.getOperand(0));
-        continue;
-      }
-      if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Input.getOperand(0).isUndef()) {
-        Input = peekThroughBitcasts(Input.getOperand(1));
-        continue;
-      }
-      break;
-    }
-    if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
-        WideSizeInBits < Input.getValueSizeInBits())
-      WideSizeInBits = Input.getValueSizeInBits();
-  }
-
-  // Bail if we fail to find a source larger than the existing root.
-  unsigned Scale = WideSizeInBits / RootSizeInBits;
-  if (WideSizeInBits <= RootSizeInBits ||
-      (WideSizeInBits % RootSizeInBits) != 0)
-    return SDValue();
-
-  // Create new mask for larger type.
-  SmallVector<int, 64> WideMask(BaseMask);
-  for (int &M : WideMask) {
-    if (M < 0)
-      continue;
-    M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
-  }
-  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
-
-  // Attempt to peek through inputs and adjust mask when we extract from an
-  // upper subvector.
-  int AdjustedMasks = 0;
-  SmallVector<SDValue, 4> WideInputs(Inputs);
-  for (unsigned I = 0; I != NumInputs; ++I) {
-    SDValue &Input = WideInputs[I];
-    Input = peekThroughBitcasts(Input);
-    while (1) {
-      if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-          Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
-        uint64_t Idx = Input.getConstantOperandVal(1);
-        if (Idx != 0) {
-          ++AdjustedMasks;
-          unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
-          Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
-
-          int lo = I * WideMask.size();
-          int hi = (I + 1) * WideMask.size();
-          for (int &M : WideMask)
-            if (lo <= M && M < hi)
-              M += Idx;
-        }
-        Input = peekThroughBitcasts(Input.getOperand(0));
-        continue;
-      }
-      // TODO: Handle insertions into upper subvectors.
-      if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Input.getOperand(0).isUndef() &&
-          isNullConstant(Input.getOperand(2))) {
-        Input = peekThroughBitcasts(Input.getOperand(1));
-        continue;
-      }
-      break;
-    }
-  }
-
-  // Remove unused/repeated shuffle source ops.
-  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
-  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
-
-  // Bail if we're always extracting from the lowest subvectors,
-  // combineX86ShuffleChain should match this for the current width, or the
-  // shuffle still references too many inputs.
-  if (AdjustedMasks == 0 || WideInputs.size() > 2)
-    return SDValue();
-
-  // Minor canonicalization of the accumulated shuffle mask to make it easier
-  // to match below. All this does is detect masks with sequential pairs of
-  // elements, and shrink them to the half-width mask. It does this in a loop
-  // so it will reduce the size of the mask to the minimal width mask which
-  // performs an equivalent shuffle.
-  while (WideMask.size() > 1) {
-    SmallVector<int, 64> WidenedMask;
-    if (!canWidenShuffleElements(WideMask, WidenedMask))
-      break;
-    WideMask = std::move(WidenedMask);
-  }
-
-  // Canonicalization of binary shuffle masks to improve pattern matching by
-  // commuting the inputs.
-  if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
-    ShuffleVectorSDNode::commuteMask(WideMask);
-    std::swap(WideInputs[0], WideInputs[1]);
-  }
-
-  // Increase depth for every upper subvector we've peeked through.
-  Depth += AdjustedMasks;
-
-  // Attempt to combine wider chain.
-  // TODO: Can we use a better Root?
-  SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
-                             WideInputs.back().getValueSizeInBits()
-                         ? WideInputs.front()
-                         : WideInputs.back();
-  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
-         "WideRootSize mismatch");
-
-  if (SDValue WideShuffle = combineX86ShuffleChain(
-          WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
-          Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
-          IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
-    WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
-    return DAG.getBitcast(RootVT, WideShuffle);
-  }
-
-  return SDValue();
-}
-
 // Canonicalize the combined shuffle mask chain with horizontal ops.
 // NOTE: This may update the Ops and Mask.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
@@ -40969,6 +40798,57 @@ static SDValue combineX86ShufflesRecursively(
     OpMask.assign(NumElts, SM_SentinelUndef);
     std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
     OpZero = OpUndef = APInt::getZero(NumElts);
+  } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+             TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+             Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
+             (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
+    // Extracting from vector larger than RootVT - scale the mask and attempt to
+    // fold the shuffle with the larger root type, then extract the lower
+    // elements.
+    unsigned Scale = Op.getOperand(0).getValueSizeInBits() / RootSizeInBits;
+    MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
+                                     Scale * RootVT.getVectorNumElements());
+    SmallVector<int, 64> NewRootMask(RootMask);
+    NewRootMask.append((Scale - 1) * RootMask.size(), SM_SentinelUndef);
+    for (int &M : NewRootMask)
+      if (0 <= M)
+        M = (M % RootMask.size()) +
+            ((M / RootMask.size()) * NewRootMask.size());
+    // If we're using the lowest subvector, just replace it directly in the src
+    // ops/nodes.
+    SmallVector<SDValue, 16> NewSrcOps(SrcOps);
+    SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
+    if (isNullConstant(Op.getOperand(1))) {
+      NewSrcOps[SrcOpIndex] = Op.getOperand(0);
+      NewSrcNodes.push_back(Op.getNode());
+    }
+    // Don't increase the combine depth - we're effectively working on the same
+    // nodes, just with a wider type.
+    if (SDValue WideShuffle = combineX86ShufflesRecursively(
+            NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
+            Depth, MaxDepth, AllowVariableCrossLaneMask,
+            AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
+      return DAG.getBitcast(
+          RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
+    return SDValue();
+  } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+             Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+             Op.getOperand(1).getOperand(0).getValueSizeInBits() >
+                 RootSizeInBits) {
+    // If we're inserting a subvector extracted from a vector larger than
+    // RootVT, then combine the insert_subvector as a shuffle; the
+    // extract_subvector will be folded in a later recursion.
+    SDValue BaseVec = Op.getOperand(0);
+    SDValue SubVec = Op.getOperand(1);
+    int InsertIdx = Op.getConstantOperandVal(2);
+    unsigned NumBaseElts = VT.getVectorNumElements();
+    unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
+    OpInputs.assign({BaseVec, SubVec});
+    OpMask.assign(NumBaseElts, SM_SentinelUndef);
+    std::iota(OpMask.begin(), OpMask.end(), 0);
+    std::iota(OpMask.begin() + InsertIdx,
+              OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
+    OpZero = OpUndef = APInt::getZero(NumBaseElts);
   } else {
     return SDValue();
   }
@@ -41324,12 +41204,7 @@ static SDValue combineX86ShufflesRecursively(
       return SDValue();
   }
 
-  // If that failed and any input is extracted then try to combine as a
-  // shuffle with the larger type.
-  return combineX86ShuffleChainWithExtract(
-      Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
-      AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
-      DAG, DL, Subtarget);
+  return SDValue();
 }
 
 /// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -43866,6 +43741,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     case X86ISD::UNPCKL:
     case X86ISD::UNPCKH:
     case X86ISD::BLENDI:
+    case X86ISD::SHUFP:
       // Integer ops.
     case X86ISD::PACKSS:
     case X86ISD::PACKUS:
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 6f4e7abda8b00..cb7da854e26d4 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -747,31 +747,23 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX512BW-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -865,31 +857,19 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -941,73 +921,91 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
-; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512F-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2011,28 +2009,16 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2163,28 +2149,16 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2315,28 +2289,16 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
+; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4512,30 +4474,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4646,10 +4595,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
@@ -4769,30 +4717,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4890,28 +4825,16 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512BW-SLOW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
+; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5576,6 +5499,9 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX1-ONLY: {{.*}}
+; AVX2-FAST: {{.*}}
+; AVX2-FAST-PERLANE: {{.*}}
+; AVX2-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}
 ; FALLBACK10: {{.*}}
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 52f856befa130..9dbfe3bbeb9e4 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vpermt2w (%rdi), %ymm0, %ymm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
   %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
   %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index a78d97782e6a3..b25fb7631fadd 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -179,11 +179,11 @@ define <4 x i64> @f64to4sl(<4 x double> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm2
-; NODQ-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
 ; NODQ-NEXT:    vmovq %rax, %xmm1
-; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; NODQ-NEXT:    vcvttsd2si %xmm2, %rax
+; NODQ-NEXT:    vmovq %rax, %xmm2
+; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-NEXT:    vcvttsd2si %xmm0, %rax
 ; NODQ-NEXT:    vmovq %rax, %xmm2
 ; NODQ-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
index ae3777453c68d..c9c5666079e58 100644
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -221,9 +221,10 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
 ;
 ; SKX-LABEL: fsub_noundef_ee:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
-; SKX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; SKX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; SKX-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,4]
+; SKX-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; SKX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a84424bf7dea9..7e2364a1350bc 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1374,11 +1374,10 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [13,9,11,12]
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1391,11 +1390,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,9,11,12]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
@@ -1773,24 +1771,13 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32
 }
 
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
-; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
-; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-FAST-PERLANE-NEXT:    vpermi2d %xmm2, %xmm1, %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vzeroupper
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
   ret <4 x i32> %res
 }
@@ -1930,103 +1917,101 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64>
 }
 
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
-; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,7,6,5]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-NEXT:    retq
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
-; CHECK-NEXT:    retq
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,7,6,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
 ; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-PERLANE-NEXT:    retq
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,7,6,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
-; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2034,21 +2019,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
 }
 
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
-; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2064,9 +2041,8 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
+; CHECK-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,0,0,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
   ret <4 x i64> %res
@@ -2083,8 +2059,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
 ; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
@@ -2106,8 +2081,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2117,23 +2091,14 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
-; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2141,21 +2106,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
 }
 
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
-; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2190,40 +2147,24 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
   ret <4 x i64> %res
 }
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
-; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
-; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2231,22 +2172,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64
 }
 
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
-; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2319,29 +2251,48 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
 }
 
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,5,6,7]
+; CHECK-FAST-NEXT:    vptestnmq %xmm2, %xmm2, %k1
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm2, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vzeroupper
+; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,5,6,7]
+; CHECK-FAST-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-PERLANE-NEXT:    vzeroupper
+; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
@@ -2709,41 +2660,27 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
 }
 
 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
-; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,1]
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovaps 32(%rdi), %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,1]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   ret <2 x i64> %res
 }
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,1]
-; CHECK-FAST-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,1]
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -2752,22 +2689,14 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 }
 
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [4,1]
-; CHECK-FAST-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [4,1]
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -2778,9 +2707,12 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm2
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,2]
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2792,9 +2724,11 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [6,2]
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2804,25 +2738,44 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64>
 }
 
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
-; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,3,4,5]
+; CHECK-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vzeroupper
+; CHECK-FAST-PERLANE-NEXT:    retq
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
   ret <4 x float> %res
 }
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,3,4,5]
+; CHECK-FAST-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-FAST-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-FAST-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vzeroupper
+; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
@@ -2830,14 +2783,24 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec,
 }
 
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,3,4,5]
+; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vzeroupper
+; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -3298,10 +3261,42 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
   ret <4 x float> %res
 }
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6]
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6]
+; CHECK-FAST-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
 ; CHECK-FAST-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
 ; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -3309,27 +3304,26 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
 ; CHECK-FAST-NEXT:    vzeroupper
 ; CHECK-FAST-NEXT:    retq
 ;
-; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-FAST-PERLANE-NEXT:    vpermi2ps %xmm0, %xmm3, %xmm4
-; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm0, %xmm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
+; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-FAST-PERLANE-NEXT:    vzeroupper
 ; CHECK-FAST-PERLANE-NEXT:    retq
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6]
+; CHECK-FAST-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
 ; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
 ; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -3337,48 +3331,15 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v
 ; CHECK-FAST-NEXT:    vzeroupper
 ; CHECK-FAST-NEXT:    retq
 ;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6]
-; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
+; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
 ; CHECK-FAST-PERLANE-NEXT:    vzeroupper
 ; CHECK-FAST-PERLANE-NEXT:    retq
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
-  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
-  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -3604,9 +3565,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3]
-; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [14,6,7,11]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3616,12 +3577,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3]
-; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [14,6,7,11]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3634,12 +3595,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [14,6,7,11]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -4045,25 +4005,15 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,5]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,1,5,5]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4071,23 +4021,14 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,5]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
-; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -4124,42 +4065,25 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
   ret <4 x double> %res
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
-; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4167,48 +4091,29 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
-; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
-; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4216,67 +4121,41 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
-; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,6]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; CHECK-FAST-PERLANE-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-FAST-PERLANE-NEXT:    vzeroupper
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,6]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
   ret <2 x double> %res
 }
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [0,6]
-; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
-; CHECK-FAST-PERLANE-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vzeroupper
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [0,6]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
@@ -4284,24 +4163,15 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
 }
 
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,6]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT:    vzeroupper
-; CHECK-FAST-NEXT:    retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
-; CHECK-FAST-PERLANE-NEXT:    vzeroupper
-; CHECK-FAST-PERLANE-NEXT:    retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,6]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -4668,23 +4538,42 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp,
 }
 
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm0
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [1,6]
+; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
   ret <2 x double> %res
 }
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,6]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
@@ -4693,13 +4582,23 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 }
 
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [1,6]
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
@@ -4710,10 +4609,13 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,12]
+; CHECK-NEXT:    vmovddup 8(%rdi), %xmm3 # xmm3 = mem[0,0]
+; CHECK-NEXT:    vpermt2pd (%rdi), %zmm2, %zmm3
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovapd %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4725,10 +4627,13 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,12]
 ; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermt2pd (%rdi), %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 8449107f39e91..9eb92b297777f 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1548,14 +1548,15 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $204, %esp
 ; X86-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT:    vmovlps %xmm1, (%esp)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll __truncdfbf2
 ; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovhps %xmm0, (%esp)
+; X86-NEXT:    vpermpd $235, {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; X86-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
+; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll __truncdfbf2
 ; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -1672,20 +1673,22 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; FP16:       # %bb.0:
 ; FP16-NEXT:    subq $184, %rsp
 ; FP16-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; FP16-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; FP16-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; FP16-NEXT:    vzeroupper
 ; FP16-NEXT:    callq __truncdfbf2 at PLT
 ; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FP16-NEXT:    vzeroupper
 ; FP16-NEXT:    callq __truncdfbf2 at PLT
 ; FP16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; FP16-NEXT:    # xmm0 = mem[1,0]
-; FP16-NEXT:    callq __truncdfbf2 at PLT
-; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; FP16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; FP16-NEXT:    vzeroupper
+; FP16-NEXT:    callq __truncdfbf2 at PLT
+; FP16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; FP16-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
 ; FP16-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; FP16-NEXT:    vzeroupper
 ; FP16-NEXT:    callq __truncdfbf2 at PLT
@@ -1721,7 +1724,7 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; FP16-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; FP16-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; FP16-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; FP16-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; FP16-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; FP16-NEXT:    addq $184, %rsp
 ; FP16-NEXT:    retq
@@ -1745,13 +1748,12 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; AVXNC-NEXT:    # xmm0 = mem[1,0]
 ; AVXNC-NEXT:    callq __truncdfbf2 at PLT
 ; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVXNC-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVXNC-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVXNC-NEXT:    vzeroupper
 ; AVXNC-NEXT:    callq __truncdfbf2 at PLT
-; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVXNC-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVXNC-NEXT:    vzeroupper
@@ -1763,15 +1765,16 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVXNC-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVXNC-NEXT:    vzeroupper
 ; AVXNC-NEXT:    callq __truncdfbf2 at PLT
-; AVXNC-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVXNC-NEXT:    # xmm0 = mem[1,0]
+; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT:    vpermpd $235, (%rsp), %ymm0 # 32-byte Folded Reload
+; AVXNC-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVXNC-NEXT:    vzeroupper
 ; AVXNC-NEXT:    callq __truncdfbf2 at PLT
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebx
-; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebp
 ; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %r14d
@@ -1781,7 +1784,9 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %r12d
 ; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %r13d
-; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVXNC-NEXT:    vzeroupper
 ; AVXNC-NEXT:    callq __truncdfbf2 at PLT
 ; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
 ; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 0748ca626bcf8..c100f15a68ca2 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -389,17 +389,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
 ; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,7]
 ; AVX2-NEXT:    .p2align 4
 ; AVX2-NEXT:  .LBB7_1: # %loop
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    subq $-128, %rax
 ; AVX2-NEXT:    jne .LBB7_1
 ; AVX2-NEXT:  # %bb.2: # %end
@@ -565,17 +566,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
 ; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,7]
 ; AVX2-NEXT:    .p2align 4
 ; AVX2-NEXT:  .LBB8_1: # %loop
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vpmovsxdq 2097168(%rdi,%rax), %ymm2
 ; AVX2-NEXT:    vpmovsxdq 2097152(%rdi,%rax), %ymm3
+; AVX2-NEXT:    vpmovsxdq 2097168(%rdi,%rax), %ymm4
+; AVX2-NEXT:    vpmuldq %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT:    vpmuldq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpmuldq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    subq $-128, %rax
 ; AVX2-NEXT:    jne .LBB8_1
 ; AVX2-NEXT:  # %bb.2: # %end
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index c982884314f62..c7ce895853f46 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -288,29 +288,13 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT:    vzeroupper
-; AVX2-FAST-ALL-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
 ; AVX512:       # %bb.0:
@@ -389,29 +373,13 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT:    vzeroupper
-; AVX2-FAST-ALL-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
 ; AVX512:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index d9393ba9febb2..0628db719c5bd 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -419,12 +419,19 @@ define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_1_unar
 }
 
 define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary(<8 x float> %x, <8 x float> %y) nounwind {
-; CHECK-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vbroadcastss %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; CHECK-NEXT:    retq
+; CHECK-SLOW-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-SLOW:       # %bb.0:
+; CHECK-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; CHECK-SLOW-NEXT:    vbroadcastss %xmm1, %xmm1
+; CHECK-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-SLOW-NEXT:    retq
+;
+; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; CHECK-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; CHECK-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-FAST-NEXT:    retq
   %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 12, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %r
 }
@@ -547,12 +554,19 @@ define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_unary(<8
 }
 
 define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vbroadcastss %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; CHECK-NEXT:    retq
+; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-SLOW:       # %bb.0:
+; CHECK-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; CHECK-SLOW-NEXT:    vbroadcastss %xmm1, %xmm1
+; CHECK-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-SLOW-NEXT:    retq
+;
+; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; CHECK-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; CHECK-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-FAST-NEXT:    retq
   %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 12, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %r
 }
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 94fa81742ba71..cc7d8e61505f5 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -1110,17 +1110,17 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
 ; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
 ; SSE-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
-; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
 ; AVX-FAST:       # %bb.0:
@@ -1129,6 +1129,18 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
 ; AVX-FAST-NEXT:    retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123:
+; AVX512-SLOW:       # %bb.0:
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX512-SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX512-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT:    retq
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1161,21 +1173,31 @@ define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
 ; SSE-FAST-NEXT:    haddpd %xmm3, %xmm1
 ; SSE-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
-; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23:
+; AVX512-SLOW:       # %bb.0:
+; AVX512-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX512-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT:    retq
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1207,16 +1229,16 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
 ; SSE-FAST-NEXT:    movapd %xmm3, %xmm1
 ; SSE-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
 ; AVX1-FAST:       # %bb.0:
@@ -1225,6 +1247,17 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
 ; AVX1-FAST-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-FAST-NEXT:    retq
 ;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3:
+; AVX512-SLOW:       # %bb.0:
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX512-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT:    retq
+;
 ; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
 ; AVX512-FAST:       # %bb.0:
 ; AVX512-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index a0778195b5c73..fbc3e20365b8b 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
 
 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
@@ -1038,13 +1038,13 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
 ; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vzeroupper
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
 ; AVX-FAST:       # %bb.0:
@@ -1052,6 +1052,22 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX512-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX512-SLOW:       # %bb.0:
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX512-SLOW-NEXT:    vzeroupper
+; AVX512-SLOW-NEXT:    retq
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x0, %x1
@@ -1103,13 +1119,13 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; SSE3-FAST-NEXT:    haddpd %xmm1, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT:    vzeroupper
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
 ; AVX-FAST:       # %bb.0:
@@ -1117,6 +1133,22 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX512-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX512-SLOW:       # %bb.0:
+; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT:    vzeroupper
+; AVX512-SLOW-NEXT:    retq
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x1, %x0
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 45b61155fe626..07abc1bb14010 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -248,36 +248,11 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
 }
 
 define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
-; X86-LABEL: signbits_sext_shuffle_sitofp:
-; X86:       # %bb.0:
-; X86-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X86-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: signbits_sext_shuffle_sitofp:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; X64-AVX2-NEXT:    retq
+; CHECK-LABEL: signbits_sext_shuffle_sitofp:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = sext <4 x i32> %a0 to <4 x i64>
   %2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %3 = sitofp <4 x i64> %2 to <4 x double>
@@ -405,15 +380,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    vmovapd 8(%ebp), %xmm3
-; X86-NEXT:    vpsrad $31, %xmm2, %xmm4
-; X86-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X86-NEXT:    vpsrad $1, %xmm5, %xmm5
-; X86-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; X86-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X86-NEXT:    vpsrad $1, %xmm4, %xmm4
 ; X86-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; X86-NEXT:    vpsrad $31, %xmm2, %xmm5
 ; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X86-NEXT:    vpsrad $1, %xmm2, %xmm2
-; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; X86-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
 ; X86-NEXT:    vextractf128 $1, %ymm1, %xmm1
@@ -421,10 +392,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm5, %xmm0
 ; X86-NEXT:    vblendvpd %xmm6, %xmm4, %xmm3, %xmm1
-; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,0]
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -433,15 +401,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ;
 ; X64-AVX1-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vpsrad $31, %xmm2, %xmm4
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X64-AVX1-NEXT:    vpsrad $1, %xmm5, %xmm5
-; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT:    vpsrad $1, %xmm4, %xmm4
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; X64-AVX1-NEXT:    vpsrad $31, %xmm2, %xmm5
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X64-AVX1-NEXT:    vpsrad $1, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; X64-AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
@@ -449,10 +413,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X64-AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm5, %xmm0
 ; X64-AVX1-NEXT:    vblendvpd %xmm6, %xmm4, %xmm3, %xmm1
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,0]
 ; X64-AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
@@ -464,9 +425,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; X64-AVX2-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
-; X64-AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX2-NEXT:    vmovapd {{.*#+}} xmm1 = [0,0,4,4]
+; X64-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c307a5759..cb3016c603a0a 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1892,8 +1892,10 @@ define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
 ; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm1
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
    %a = sext <16 x i16> %A to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 3187bf6448690..1afb26adaf323 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -155,21 +155,14 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
 ; AVX2-NEXT:    vpacksswb %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovmskb %xmm2, %eax
 ; AVX2-NEXT:    testb $1, %al
-; AVX2-NEXT:    je LBB0_2
-; AVX2-NEXT:  ## %bb.1: ## %cond.store
-; AVX2-NEXT:    vmovq %xmm0, (%rdi)
-; AVX2-NEXT:    addq $8, %rdi
-; AVX2-NEXT:  LBB0_2: ## %else
+; AVX2-NEXT:    jne LBB0_1
+; AVX2-NEXT:  ## %bb.2: ## %else
 ; AVX2-NEXT:    testb $2, %al
-; AVX2-NEXT:    je LBB0_4
-; AVX2-NEXT:  ## %bb.3: ## %cond.store1
-; AVX2-NEXT:    vmovhpd %xmm0, (%rdi)
-; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    jne LBB0_3
 ; AVX2-NEXT:  LBB0_4: ## %else2
 ; AVX2-NEXT:    testb $4, %al
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    jne LBB0_5
-; AVX2-NEXT:  ## %bb.6: ## %else5
+; AVX2-NEXT:  LBB0_6: ## %else5
 ; AVX2-NEXT:    testb $8, %al
 ; AVX2-NEXT:    jne LBB0_7
 ; AVX2-NEXT:  LBB0_8: ## %else8
@@ -177,27 +170,35 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
 ; AVX2-NEXT:    jne LBB0_9
 ; AVX2-NEXT:  LBB0_10: ## %else11
 ; AVX2-NEXT:    testb $32, %al
-; AVX2-NEXT:    je LBB0_12
-; AVX2-NEXT:  LBB0_11: ## %cond.store13
-; AVX2-NEXT:    vmovhps %xmm1, (%rdi)
-; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    jne LBB0_11
 ; AVX2-NEXT:  LBB0_12: ## %else14
 ; AVX2-NEXT:    testb $64, %al
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX2-NEXT:    jne LBB0_13
-; AVX2-NEXT:  ## %bb.14: ## %else17
+; AVX2-NEXT:  LBB0_14: ## %else17
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    jne LBB0_15
 ; AVX2-NEXT:  LBB0_16: ## %else20
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
-; AVX2-NEXT:  LBB0_5: ## %cond.store4
+; AVX2-NEXT:  LBB0_1: ## %cond.store
 ; AVX2-NEXT:    vmovq %xmm0, (%rdi)
 ; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je LBB0_4
+; AVX2-NEXT:  LBB0_3: ## %cond.store1
+; AVX2-NEXT:    vmovhpd %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $4, %al
+; AVX2-NEXT:    je LBB0_6
+; AVX2-NEXT:  LBB0_5: ## %cond.store4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $8, %al
 ; AVX2-NEXT:    je LBB0_8
 ; AVX2-NEXT:  LBB0_7: ## %cond.store7
-; AVX2-NEXT:    vmovhpd %xmm0, (%rdi)
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je LBB0_10
@@ -205,15 +206,21 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
 ; AVX2-NEXT:    vmovlps %xmm1, (%rdi)
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $32, %al
-; AVX2-NEXT:    jne LBB0_11
-; AVX2-NEXT:    jmp LBB0_12
+; AVX2-NEXT:    je LBB0_12
+; AVX2-NEXT:  LBB0_11: ## %cond.store13
+; AVX2-NEXT:    vmovhps %xmm1, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $64, %al
+; AVX2-NEXT:    je LBB0_14
 ; AVX2-NEXT:  LBB0_13: ## %cond.store16
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je LBB0_16
 ; AVX2-NEXT:  LBB0_15: ## %cond.store19
-; AVX2-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -380,136 +387,278 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; SSE-NEXT:    movhps %xmm7, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: compressstore_v16f64_v16i1:
-; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vpsllw $7, %xmm4, %xmm4
-; AVX1OR2-NEXT:    vpmovmskb %xmm4, %eax
-; AVX1OR2-NEXT:    testb $1, %al
-; AVX1OR2-NEXT:    je LBB1_2
-; AVX1OR2-NEXT:  ## %bb.1: ## %cond.store
-; AVX1OR2-NEXT:    vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:  LBB1_2: ## %else
-; AVX1OR2-NEXT:    testb $2, %al
-; AVX1OR2-NEXT:    je LBB1_4
-; AVX1OR2-NEXT:  ## %bb.3: ## %cond.store1
-; AVX1OR2-NEXT:    vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:  LBB1_4: ## %else2
-; AVX1OR2-NEXT:    testb $4, %al
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1OR2-NEXT:    jne LBB1_5
-; AVX1OR2-NEXT:  ## %bb.6: ## %else5
-; AVX1OR2-NEXT:    testb $8, %al
-; AVX1OR2-NEXT:    jne LBB1_7
-; AVX1OR2-NEXT:  LBB1_8: ## %else8
-; AVX1OR2-NEXT:    testb $16, %al
-; AVX1OR2-NEXT:    jne LBB1_9
-; AVX1OR2-NEXT:  LBB1_10: ## %else11
-; AVX1OR2-NEXT:    testb $32, %al
-; AVX1OR2-NEXT:    je LBB1_12
-; AVX1OR2-NEXT:  LBB1_11: ## %cond.store13
-; AVX1OR2-NEXT:    vmovhps %xmm1, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:  LBB1_12: ## %else14
-; AVX1OR2-NEXT:    testb $64, %al
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1OR2-NEXT:    jne LBB1_13
-; AVX1OR2-NEXT:  ## %bb.14: ## %else17
-; AVX1OR2-NEXT:    testb %al, %al
-; AVX1OR2-NEXT:    js LBB1_15
-; AVX1OR2-NEXT:  LBB1_16: ## %else20
-; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
-; AVX1OR2-NEXT:    jne LBB1_17
-; AVX1OR2-NEXT:  LBB1_18: ## %else23
-; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
-; AVX1OR2-NEXT:    je LBB1_20
-; AVX1OR2-NEXT:  LBB1_19: ## %cond.store25
-; AVX1OR2-NEXT:    vmovhps %xmm2, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:  LBB1_20: ## %else26
-; AVX1OR2-NEXT:    testl $1024, %eax ## imm = 0x400
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1OR2-NEXT:    jne LBB1_21
-; AVX1OR2-NEXT:  ## %bb.22: ## %else29
-; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
-; AVX1OR2-NEXT:    jne LBB1_23
-; AVX1OR2-NEXT:  LBB1_24: ## %else32
-; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
-; AVX1OR2-NEXT:    jne LBB1_25
-; AVX1OR2-NEXT:  LBB1_26: ## %else35
-; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
-; AVX1OR2-NEXT:    je LBB1_28
-; AVX1OR2-NEXT:  LBB1_27: ## %cond.store37
-; AVX1OR2-NEXT:    vmovhps %xmm3, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:  LBB1_28: ## %else38
-; AVX1OR2-NEXT:    testl $16384, %eax ## imm = 0x4000
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm3, %xmm0
-; AVX1OR2-NEXT:    jne LBB1_29
-; AVX1OR2-NEXT:  ## %bb.30: ## %else41
-; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
-; AVX1OR2-NEXT:    jne LBB1_31
-; AVX1OR2-NEXT:  LBB1_32: ## %else44
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
-; AVX1OR2-NEXT:  LBB1_5: ## %cond.store4
-; AVX1OR2-NEXT:    vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testb $8, %al
-; AVX1OR2-NEXT:    je LBB1_8
-; AVX1OR2-NEXT:  LBB1_7: ## %cond.store7
-; AVX1OR2-NEXT:    vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testb $16, %al
-; AVX1OR2-NEXT:    je LBB1_10
-; AVX1OR2-NEXT:  LBB1_9: ## %cond.store10
-; AVX1OR2-NEXT:    vmovlps %xmm1, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testb $32, %al
-; AVX1OR2-NEXT:    jne LBB1_11
-; AVX1OR2-NEXT:    jmp LBB1_12
-; AVX1OR2-NEXT:  LBB1_13: ## %cond.store16
-; AVX1OR2-NEXT:    vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testb %al, %al
-; AVX1OR2-NEXT:    jns LBB1_16
-; AVX1OR2-NEXT:  LBB1_15: ## %cond.store19
-; AVX1OR2-NEXT:    vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $256, %eax ## imm = 0x100
-; AVX1OR2-NEXT:    je LBB1_18
-; AVX1OR2-NEXT:  LBB1_17: ## %cond.store22
-; AVX1OR2-NEXT:    vmovlps %xmm2, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $512, %eax ## imm = 0x200
-; AVX1OR2-NEXT:    jne LBB1_19
-; AVX1OR2-NEXT:    jmp LBB1_20
-; AVX1OR2-NEXT:  LBB1_21: ## %cond.store28
-; AVX1OR2-NEXT:    vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $2048, %eax ## imm = 0x800
-; AVX1OR2-NEXT:    je LBB1_24
-; AVX1OR2-NEXT:  LBB1_23: ## %cond.store31
-; AVX1OR2-NEXT:    vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $4096, %eax ## imm = 0x1000
-; AVX1OR2-NEXT:    je LBB1_26
-; AVX1OR2-NEXT:  LBB1_25: ## %cond.store34
-; AVX1OR2-NEXT:    vmovlps %xmm3, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $8192, %eax ## imm = 0x2000
-; AVX1OR2-NEXT:    jne LBB1_27
-; AVX1OR2-NEXT:    jmp LBB1_28
-; AVX1OR2-NEXT:  LBB1_29: ## %cond.store40
-; AVX1OR2-NEXT:    vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    addq $8, %rdi
-; AVX1OR2-NEXT:    testl $32768, %eax ## imm = 0x8000
-; AVX1OR2-NEXT:    je LBB1_32
-; AVX1OR2-NEXT:  LBB1_31: ## %cond.store43
-; AVX1OR2-NEXT:    vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: compressstore_v16f64_v16i1:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovmskb %xmm4, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    je LBB1_2
+; AVX1-NEXT:  ## %bb.1: ## %cond.store
+; AVX1-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:  LBB1_2: ## %else
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    je LBB1_4
+; AVX1-NEXT:  ## %bb.3: ## %cond.store1
+; AVX1-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:  LBB1_4: ## %else2
+; AVX1-NEXT:    testb $4, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    jne LBB1_5
+; AVX1-NEXT:  ## %bb.6: ## %else5
+; AVX1-NEXT:    testb $8, %al
+; AVX1-NEXT:    jne LBB1_7
+; AVX1-NEXT:  LBB1_8: ## %else8
+; AVX1-NEXT:    testb $16, %al
+; AVX1-NEXT:    jne LBB1_9
+; AVX1-NEXT:  LBB1_10: ## %else11
+; AVX1-NEXT:    testb $32, %al
+; AVX1-NEXT:    je LBB1_12
+; AVX1-NEXT:  LBB1_11: ## %cond.store13
+; AVX1-NEXT:    vmovhps %xmm1, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:  LBB1_12: ## %else14
+; AVX1-NEXT:    testb $64, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    jne LBB1_13
+; AVX1-NEXT:  ## %bb.14: ## %else17
+; AVX1-NEXT:    testb %al, %al
+; AVX1-NEXT:    js LBB1_15
+; AVX1-NEXT:  LBB1_16: ## %else20
+; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
+; AVX1-NEXT:    jne LBB1_17
+; AVX1-NEXT:  LBB1_18: ## %else23
+; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
+; AVX1-NEXT:    je LBB1_20
+; AVX1-NEXT:  LBB1_19: ## %cond.store25
+; AVX1-NEXT:    vmovhps %xmm2, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:  LBB1_20: ## %else26
+; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    jne LBB1_21
+; AVX1-NEXT:  ## %bb.22: ## %else29
+; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
+; AVX1-NEXT:    jne LBB1_23
+; AVX1-NEXT:  LBB1_24: ## %else32
+; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
+; AVX1-NEXT:    jne LBB1_25
+; AVX1-NEXT:  LBB1_26: ## %else35
+; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
+; AVX1-NEXT:    je LBB1_28
+; AVX1-NEXT:  LBB1_27: ## %cond.store37
+; AVX1-NEXT:    vmovhps %xmm3, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:  LBB1_28: ## %else38
+; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    jne LBB1_29
+; AVX1-NEXT:  ## %bb.30: ## %else41
+; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
+; AVX1-NEXT:    jne LBB1_31
+; AVX1-NEXT:  LBB1_32: ## %else44
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  LBB1_5: ## %cond.store4
+; AVX1-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testb $8, %al
+; AVX1-NEXT:    je LBB1_8
+; AVX1-NEXT:  LBB1_7: ## %cond.store7
+; AVX1-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testb $16, %al
+; AVX1-NEXT:    je LBB1_10
+; AVX1-NEXT:  LBB1_9: ## %cond.store10
+; AVX1-NEXT:    vmovlps %xmm1, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testb $32, %al
+; AVX1-NEXT:    jne LBB1_11
+; AVX1-NEXT:    jmp LBB1_12
+; AVX1-NEXT:  LBB1_13: ## %cond.store16
+; AVX1-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testb %al, %al
+; AVX1-NEXT:    jns LBB1_16
+; AVX1-NEXT:  LBB1_15: ## %cond.store19
+; AVX1-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
+; AVX1-NEXT:    je LBB1_18
+; AVX1-NEXT:  LBB1_17: ## %cond.store22
+; AVX1-NEXT:    vmovlps %xmm2, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $512, %eax ## imm = 0x200
+; AVX1-NEXT:    jne LBB1_19
+; AVX1-NEXT:    jmp LBB1_20
+; AVX1-NEXT:  LBB1_21: ## %cond.store28
+; AVX1-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
+; AVX1-NEXT:    je LBB1_24
+; AVX1-NEXT:  LBB1_23: ## %cond.store31
+; AVX1-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
+; AVX1-NEXT:    je LBB1_26
+; AVX1-NEXT:  LBB1_25: ## %cond.store34
+; AVX1-NEXT:    vmovlps %xmm3, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
+; AVX1-NEXT:    jne LBB1_27
+; AVX1-NEXT:    jmp LBB1_28
+; AVX1-NEXT:  LBB1_29: ## %cond.store40
+; AVX1-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX1-NEXT:    addq $8, %rdi
+; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
+; AVX1-NEXT:    je LBB1_32
+; AVX1-NEXT:  LBB1_31: ## %cond.store43
+; AVX1-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: compressstore_v16f64_v16i1:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX2-NEXT:    vpmovmskb %xmm4, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    jne LBB1_1
+; AVX2-NEXT:  ## %bb.2: ## %else
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    jne LBB1_3
+; AVX2-NEXT:  LBB1_4: ## %else2
+; AVX2-NEXT:    testb $4, %al
+; AVX2-NEXT:    jne LBB1_5
+; AVX2-NEXT:  LBB1_6: ## %else5
+; AVX2-NEXT:    testb $8, %al
+; AVX2-NEXT:    jne LBB1_7
+; AVX2-NEXT:  LBB1_8: ## %else8
+; AVX2-NEXT:    testb $16, %al
+; AVX2-NEXT:    jne LBB1_9
+; AVX2-NEXT:  LBB1_10: ## %else11
+; AVX2-NEXT:    testb $32, %al
+; AVX2-NEXT:    jne LBB1_11
+; AVX2-NEXT:  LBB1_12: ## %else14
+; AVX2-NEXT:    testb $64, %al
+; AVX2-NEXT:    jne LBB1_13
+; AVX2-NEXT:  LBB1_14: ## %else17
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    js LBB1_15
+; AVX2-NEXT:  LBB1_16: ## %else20
+; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
+; AVX2-NEXT:    jne LBB1_17
+; AVX2-NEXT:  LBB1_18: ## %else23
+; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
+; AVX2-NEXT:    jne LBB1_19
+; AVX2-NEXT:  LBB1_20: ## %else26
+; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
+; AVX2-NEXT:    jne LBB1_21
+; AVX2-NEXT:  LBB1_22: ## %else29
+; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
+; AVX2-NEXT:    jne LBB1_23
+; AVX2-NEXT:  LBB1_24: ## %else32
+; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
+; AVX2-NEXT:    jne LBB1_25
+; AVX2-NEXT:  LBB1_26: ## %else35
+; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
+; AVX2-NEXT:    jne LBB1_27
+; AVX2-NEXT:  LBB1_28: ## %else38
+; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
+; AVX2-NEXT:    jne LBB1_29
+; AVX2-NEXT:  LBB1_30: ## %else41
+; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
+; AVX2-NEXT:    jne LBB1_31
+; AVX2-NEXT:  LBB1_32: ## %else44
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  LBB1_1: ## %cond.store
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je LBB1_4
+; AVX2-NEXT:  LBB1_3: ## %cond.store1
+; AVX2-NEXT:    vmovhps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $4, %al
+; AVX2-NEXT:    je LBB1_6
+; AVX2-NEXT:  LBB1_5: ## %cond.store4
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vmovlps %xmm4, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $8, %al
+; AVX2-NEXT:    je LBB1_8
+; AVX2-NEXT:  LBB1_7: ## %cond.store7
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $16, %al
+; AVX2-NEXT:    je LBB1_10
+; AVX2-NEXT:  LBB1_9: ## %cond.store10
+; AVX2-NEXT:    vmovlps %xmm1, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $32, %al
+; AVX2-NEXT:    je LBB1_12
+; AVX2-NEXT:  LBB1_11: ## %cond.store13
+; AVX2-NEXT:    vmovhps %xmm1, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb $64, %al
+; AVX2-NEXT:    je LBB1_14
+; AVX2-NEXT:  LBB1_13: ## %cond.store16
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    jns LBB1_16
+; AVX2-NEXT:  LBB1_15: ## %cond.store19
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
+; AVX2-NEXT:    je LBB1_18
+; AVX2-NEXT:  LBB1_17: ## %cond.store22
+; AVX2-NEXT:    vmovlps %xmm2, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
+; AVX2-NEXT:    je LBB1_20
+; AVX2-NEXT:  LBB1_19: ## %cond.store25
+; AVX2-NEXT:    vmovhps %xmm2, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
+; AVX2-NEXT:    je LBB1_22
+; AVX2-NEXT:  LBB1_21: ## %cond.store28
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
+; AVX2-NEXT:    je LBB1_24
+; AVX2-NEXT:  LBB1_23: ## %cond.store31
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm2[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
+; AVX2-NEXT:    je LBB1_26
+; AVX2-NEXT:  LBB1_25: ## %cond.store34
+; AVX2-NEXT:    vmovlps %xmm3, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
+; AVX2-NEXT:    je LBB1_28
+; AVX2-NEXT:  LBB1_27: ## %cond.store37
+; AVX2-NEXT:    vmovhps %xmm3, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
+; AVX2-NEXT:    je LBB1_30
+; AVX2-NEXT:  LBB1_29: ## %cond.store40
+; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    addq $8, %rdi
+; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
+; AVX2-NEXT:    je LBB1_32
+; AVX2-NEXT:  LBB1_31: ## %cond.store43
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: compressstore_v16f64_v16i1:
 ; AVX512F:       ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 1289eef7795dc..4cd6bcf6efcb2 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1334,7 +1334,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
 ; KNL_64-NEXT:    vmovhps %xmm0, (%rcx)
 ; KNL_64-NEXT:  .LBB18_4: # %else2
 ; KNL_64-NEXT:    testb $4, %al
-; KNL_64-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; KNL_64-NEXT:    jne .LBB18_5
 ; KNL_64-NEXT:  # %bb.6: # %else4
@@ -1344,13 +1343,15 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ; KNL_64-NEXT:  .LBB18_5: # %cond.store3
+; KNL_64-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; KNL_64-NEXT:    vmovq %xmm1, %rcx
-; KNL_64-NEXT:    vmovlps %xmm0, (%rcx)
+; KNL_64-NEXT:    vmovlps %xmm2, (%rcx)
 ; KNL_64-NEXT:    testb $8, %al
 ; KNL_64-NEXT:    je .LBB18_8
 ; KNL_64-NEXT:  .LBB18_7: # %cond.store5
+; KNL_64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; KNL_64-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_64-NEXT:    vmovhps %xmm0, (%rax)
+; KNL_64-NEXT:    vmovlps %xmm0, (%rax)
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
@@ -1365,34 +1366,39 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
 ; KNL_32-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; KNL_32-NEXT:    kmovw %k0, %eax
 ; KNL_32-NEXT:    testb $1, %al
-; KNL_32-NEXT:    je .LBB18_2
-; KNL_32-NEXT:  # %bb.1: # %cond.store
-; KNL_32-NEXT:    vmovd %xmm1, %ecx
-; KNL_32-NEXT:    vmovlps %xmm0, (%ecx)
-; KNL_32-NEXT:  .LBB18_2: # %else
+; KNL_32-NEXT:    jne .LBB18_1
+; KNL_32-NEXT:  # %bb.2: # %else
 ; KNL_32-NEXT:    testb $2, %al
-; KNL_32-NEXT:    je .LBB18_4
-; KNL_32-NEXT:  # %bb.3: # %cond.store1
-; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
-; KNL_32-NEXT:    vmovhps %xmm0, (%ecx)
+; KNL_32-NEXT:    jne .LBB18_3
 ; KNL_32-NEXT:  .LBB18_4: # %else2
 ; KNL_32-NEXT:    testb $4, %al
-; KNL_32-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; KNL_32-NEXT:    jne .LBB18_5
-; KNL_32-NEXT:  # %bb.6: # %else4
+; KNL_32-NEXT:  .LBB18_6: # %else4
 ; KNL_32-NEXT:    testb $8, %al
 ; KNL_32-NEXT:    jne .LBB18_7
 ; KNL_32-NEXT:  .LBB18_8: # %else6
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
+; KNL_32-NEXT:  .LBB18_1: # %cond.store
+; KNL_32-NEXT:    vmovd %xmm1, %ecx
+; KNL_32-NEXT:    vmovlps %xmm0, (%ecx)
+; KNL_32-NEXT:    testb $2, %al
+; KNL_32-NEXT:    je .LBB18_4
+; KNL_32-NEXT:  .LBB18_3: # %cond.store1
+; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT:    vmovhps %xmm0, (%ecx)
+; KNL_32-NEXT:    testb $4, %al
+; KNL_32-NEXT:    je .LBB18_6
 ; KNL_32-NEXT:  .LBB18_5: # %cond.store3
+; KNL_32-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; KNL_32-NEXT:    vpextrd $2, %xmm1, %ecx
-; KNL_32-NEXT:    vmovlps %xmm0, (%ecx)
+; KNL_32-NEXT:    vmovlps %xmm2, (%ecx)
 ; KNL_32-NEXT:    testb $8, %al
 ; KNL_32-NEXT:    je .LBB18_8
 ; KNL_32-NEXT:  .LBB18_7: # %cond.store5
+; KNL_32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; KNL_32-NEXT:    vpextrd $3, %xmm1, %eax
-; KNL_32-NEXT:    vmovhps %xmm0, (%eax)
+; KNL_32-NEXT:    vmovlps %xmm0, (%eax)
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7320275091c6..271c7d15e4532 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -4892,18 +4892,32 @@ define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind {
 ; SSE-NEXT:    movhps %xmm1, 24(%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: one_mask_bit_set4:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovhps %xmm0, 24(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: one_mask_bit_set4:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovhps %xmm0, 24(%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: one_mask_bit_set4:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, 24(%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: one_mask_bit_set4:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-NEXT:    vmovlps %xmm0, 24(%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set4:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX512-NEXT:    vmovhps %xmm0, 24(%eax)
+; X86-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; X86-AVX512-NEXT:    vmovlps %xmm0, 24(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
   call void @llvm.masked.store.v4f64.p0(<4 x double> %val, ptr %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
@@ -4990,8 +5004,7 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind {
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
 ; X86-AVX512-NEXT:    vmovlps %xmm0, 48(%eax)
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-AVX512-NEXT:    vpermpd {{.*#+}} zmm0 = zmm1[3,2,2,3,7,6,6,7]
 ; X86-AVX512-NEXT:    vmovlps %xmm0, 88(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -5934,18 +5947,19 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
 ; AVX2-NEXT:    vpcmpgtd %ymm5, %ymm3, %ymm3
 ; AVX2-NEXT:    vpacksswb %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3]
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX2-NEXT:    vpslld $31, %ymm3, %ymm3
 ; AVX2-NEXT:    vpmaskmovd %ymm0, %ymm3, (%rdx)
+; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, 32(%rdx)
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaskmovd %ymm2, %ymm0, 64(%rdx)
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, 32(%rdx)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 9b8816bd11f70..cf6e0f38dd2f2 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -357,14 +357,14 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ;
 ; AVX512F-LABEL: test_mul3x3_f32:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
-; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm3
-; AVX512F-NEXT:    vmulps %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm2
+; AVX512F-NEXT:    vmulps %xmm2, %xmm0, %xmm3
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
 ; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
 ; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm4
 ; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm4
-; AVX512F-NEXT:    vshufpd {{.*#+}} xmm3 = xmm5[1,0]
+; AVX512F-NEXT:    vpermpd {{.*#+}} zmm3 = zmm0[3,3,3,3,7,7,7,7]
 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
 ; AVX512F-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
@@ -416,38 +416,38 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ;
 ; AVX512VL-LABEL: test_mul3x3_f32:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm3
-; AVX512VL-NEXT:    vmulps %xmm3, %xmm0, %xmm3
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm2
+; AVX512VL-NEXT:    vmulps %xmm2, %xmm0, %xmm2
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-NEXT:    valignd {{.*#+}} zmm4 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512VL-NEXT:    vmulps %xmm5, %xmm2, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm3, %xmm3
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm6 = xmm4[1,0]
+; AVX512VL-NEXT:    vmulps %xmm5, %xmm4, %xmm6
+; AVX512VL-NEXT:    vaddps %xmm6, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpermpd {{.*#+}} zmm6 = zmm0[3,3,3,3,7,7,7,7]
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512VL-NEXT:    vmulps %xmm6, %xmm9, %xmm9
-; AVX512VL-NEXT:    vaddps %xmm3, %xmm9, %xmm3
+; AVX512VL-NEXT:    vaddps %xmm2, %xmm9, %xmm2
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm9 = xmm0[1,0]
 ; AVX512VL-NEXT:    vmulss %xmm1, %xmm9, %xmm10
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX512VL-NEXT:    vmulss %xmm5, %xmm4, %xmm5
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512VL-NEXT:    vmulss %xmm5, %xmm3, %xmm5
 ; AVX512VL-NEXT:    vaddss %xmm5, %xmm10, %xmm5
 ; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
 ; AVX512VL-NEXT:    vmulss %xmm8, %xmm10, %xmm8
 ; AVX512VL-NEXT:    vaddss %xmm5, %xmm8, %xmm5
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3]
 ; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm5
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm8
 ; AVX512VL-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm11, %xmm11
+; AVX512VL-NEXT:    vmulps %xmm4, %xmm11, %xmm11
 ; AVX512VL-NEXT:    vaddps %xmm5, %xmm11, %xmm5
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3]
 ; AVX512VL-NEXT:    vmulps %xmm6, %xmm11, %xmm12
 ; AVX512VL-NEXT:    vaddps %xmm5, %xmm12, %xmm5
 ; AVX512VL-NEXT:    vmulss %xmm7, %xmm9, %xmm7
-; AVX512VL-NEXT:    vmulss %xmm4, %xmm8, %xmm12
+; AVX512VL-NEXT:    vmulss %xmm3, %xmm8, %xmm12
 ; AVX512VL-NEXT:    vaddss %xmm7, %xmm12, %xmm7
 ; AVX512VL-NEXT:    vmulss %xmm11, %xmm10, %xmm11
 ; AVX512VL-NEXT:    vaddss %xmm7, %xmm11, %xmm7
@@ -456,19 +456,19 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm11 = xmm8[1,0]
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
 ; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm2, %xmm2
-; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmulps %xmm7, %xmm4, %xmm4
+; AVX512VL-NEXT:    vaddps %xmm4, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm2
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm6, %xmm2
-; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmulss %xmm11, %xmm9, %xmm2
-; AVX512VL-NEXT:    vmulss %xmm7, %xmm4, %xmm4
-; AVX512VL-NEXT:    vaddss %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm4
+; AVX512VL-NEXT:    vmulps %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmulss %xmm11, %xmm9, %xmm4
+; AVX512VL-NEXT:    vmulss %xmm7, %xmm3, %xmm3
+; AVX512VL-NEXT:    vaddss %xmm3, %xmm4, %xmm3
 ; AVX512VL-NEXT:    vmulss %xmm1, %xmm10, %xmm1
-; AVX512VL-NEXT:    vaddss %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT:    vaddss %xmm1, %xmm3, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
 ; AVX512VL-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 9b08d8baacee1..2ac691d547f31 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1223,17 +1223,17 @@ define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-leg
 ; CHECK-LABEL: zext_v16i8_v16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
+; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdi)
 ; CHECK-NEXT:    vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT:    vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT:    vmovdqa %ymm2, 32(%rdi)
+; CHECK-NEXT:    vmovdqa %ymm2, 96(%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %a = zext <16 x i8> %x to <16 x i64>
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 8fd8e0e8120c1..4d0941711ff52 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1464,8 +1464,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX1-NEXT:    vmovups (%rdi), %ymm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX1-NEXT:    vmovups 16(%rdi), %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
@@ -1568,8 +1568,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; XOP-NEXT:    vmovups 32(%rdi), %ymm1
 ; XOP-NEXT:    vmovups (%rdi), %ymm2
 ; XOP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; XOP-NEXT:    vmovups 16(%rdi), %xmm4
-; XOP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; XOP-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
 ; XOP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
 ; XOP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; XOP-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index c7cc2acaf2627..b8f5802421a7a 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1123,15 +1123,26 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: mul_v4i64_zero_lower:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2-LABEL: mul_v4i64_zero_lower:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,2,4,6]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v4i64_zero_lower:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
 entry:
   %val1a = zext <4 x i32> %val1 to <4 x i64>
   %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
@@ -1197,9 +1208,9 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512-NEXT:    retq
 entry:
   %val1a = zext <8 x i32> %val1 to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll
index ded100b2accce..8b744d245bf17 100644
--- a/llvm/test/CodeGen/X86/pr132844.ll
+++ b/llvm/test/CodeGen/X86/pr132844.ll
@@ -4,11 +4,11 @@
 define  { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) {
 ; CHECK-LABEL: PR132844:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
+; CHECK-NEXT:    vpermt2q 0, %ymm2, %ymm3
+; CHECK-NEXT:    vpermt2q %ymm0, %ymm2, %ymm1
+; CHECK-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm0
 ; CHECK-NEXT:    vmovdqu %ymm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 2e5c6f047292c..31a1c5a77d88d 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -11,8 +11,6 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
 ; CHECK-NEXT:    subq $136, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 144
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm13
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17]
-; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm5
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,1,2,3]
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21]
@@ -21,6 +19,8 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
 ; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,20,1,27]
 ; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm4
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17]
+; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm5
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23]
 ; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm7
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23]
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 2d1b7fcbf0239..ed09d8adf1bc8 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -28,15 +28,16 @@ define i64 @PR62286(i32 %a) {
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR62286:
diff --git a/llvm/test/CodeGen/X86/pr95278.ll b/llvm/test/CodeGen/X86/pr95278.ll
index 104fc04d68cdb..32b507d1a46a2 100644
--- a/llvm/test/CodeGen/X86/pr95278.ll
+++ b/llvm/test/CodeGen/X86/pr95278.ll
@@ -5,8 +5,8 @@ define void @PR95278(ptr %p0, ptr %p1) {
 ; CHECK-LABEL: PR95278:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtph2ps 2016(%rdi), %zmm0
-; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,6]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll
index a539a33e9a281..ca5c63cdc1c2e 100644
--- a/llvm/test/CodeGen/X86/pr97968.ll
+++ b/llvm/test/CodeGen/X86/pr97968.ll
@@ -5,8 +5,8 @@ define <2 x i32> @PR97968(<16 x i32> %a0) {
 ; CHECK-LABEL: PR97968:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f7dd1dc0949f5..91dcba40fcf5a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -8,19 +8,21 @@
 define <8 x i64> @shl_i512_1(<8 x i64> %a)  {
 ; AVX512VL-LABEL: shl_i512_1:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512VL-NEXT:    vpsllq $1, %xmm0, %xmm3
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; AVX512VL-NEXT:    vpsrlq $63, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT:    vpaddq %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddq %xmm2, %xmm2, %xmm5
+; AVX512VL-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
 ; AVX512VL-NEXT:    vpsrlq $63, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT:    vpaddq %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512VL-NEXT:    vpsrlq $63, %zmm0, %zmm2
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 26af46263c0e2..5c61622b4f922 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -965,24 +965,21 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512F-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX512F-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX512F-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX512F-NEXT:    vmovd %xmm2, %esi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vpextrw $6, %xmm2, %edi
-; AVX512F-NEXT:    vpextrw $4, %xmm2, %r8d
-; AVX512F-NEXT:    vpextrw $2, %xmm2, %r9d
-; AVX512F-NEXT:    vmovd %xmm2, %r10d
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpextrw $6, %xmm0, %r11d
-; AVX512F-NEXT:    vpextrw $4, %xmm0, %ebx
-; AVX512F-NEXT:    vpextrw $2, %xmm0, %ebp
-; AVX512F-NEXT:    vpinsrb $5, %ebp, %xmm1, %xmm0
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT:    vpextrw $4, %xmm1, %ecx
+; AVX512F-NEXT:    vpextrw $2, %xmm1, %edx
+; AVX512F-NEXT:    vmovd %xmm1, %esi
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrw $6, %xmm1, %edi
+; AVX512F-NEXT:    vpextrw $4, %xmm1, %r8d
+; AVX512F-NEXT:    vpextrw $2, %xmm1, %r9d
+; AVX512F-NEXT:    vmovd %xmm1, %r10d
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpextrw $6, %xmm1, %r11d
+; AVX512F-NEXT:    vpextrw $4, %xmm1, %ebx
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
@@ -994,7 +991,6 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ; AVX512F-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1039,24 +1035,21 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512BW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
 ; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX512BW-NEXT:    vmovd %xmm2, %esi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrw $6, %xmm2, %edi
-; AVX512BW-NEXT:    vpextrw $4, %xmm2, %r8d
-; AVX512BW-NEXT:    vpextrw $2, %xmm2, %r9d
-; AVX512BW-NEXT:    vmovd %xmm2, %r10d
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vpextrw $6, %xmm0, %r11d
-; AVX512BW-NEXT:    vpextrw $4, %xmm0, %ebx
-; AVX512BW-NEXT:    vpextrw $2, %xmm0, %ebp
-; AVX512BW-NEXT:    vpinsrb $5, %ebp, %xmm1, %xmm0
+; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX512BW-NEXT:    vpextrw $4, %xmm1, %ecx
+; AVX512BW-NEXT:    vpextrw $2, %xmm1, %edx
+; AVX512BW-NEXT:    vmovd %xmm1, %esi
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512BW-NEXT:    vpextrw $6, %xmm1, %edi
+; AVX512BW-NEXT:    vpextrw $4, %xmm1, %r8d
+; AVX512BW-NEXT:    vpextrw $2, %xmm1, %r9d
+; AVX512BW-NEXT:    vmovd %xmm1, %r10d
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpextrw $6, %xmm1, %r11d
+; AVX512BW-NEXT:    vpextrw $4, %xmm1, %ebx
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
@@ -1068,74 +1061,47 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ; AVX512BW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
-; AVX512BWVL-ONLY-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY:       # %bb.0:
-; AVX512BWVL-ONLY-NEXT:    pushq %rbp
-; AVX512BWVL-ONLY-NEXT:    pushq %r14
-; AVX512BWVL-ONLY-NEXT:    pushq %rbx
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    popq %rbx
-; AVX512BWVL-ONLY-NEXT:    popq %r14
-; AVX512BWVL-ONLY-NEXT:    popq %rbp
-; AVX512BWVL-ONLY-NEXT:    vzeroupper
-; AVX512BWVL-ONLY-NEXT:    retq
-;
-; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-FAST-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
-; AVX512VBMI-FAST-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-FAST-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX512VBMI-FAST-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512VBMI-FAST-NEXT:    vzeroupper
-; AVX512VBMI-FAST-NEXT:    retq
-;
-; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
-; AVX512VBMI-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT:    vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vzeroupper
-; AVX512VBMI-SLOW-NEXT:    retq
+; AVX512BWVL-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    pushq %rbp
+; AVX512BWVL-NEXT:    pushq %r14
+; AVX512BWVL-NEXT:    pushq %rbx
+; AVX512BWVL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BWVL-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX512BWVL-NEXT:    vpextrw $4, %xmm1, %ecx
+; AVX512BWVL-NEXT:    vpextrw $2, %xmm1, %edx
+; AVX512BWVL-NEXT:    vmovd %xmm1, %esi
+; AVX512BWVL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512BWVL-NEXT:    vpextrw $6, %xmm1, %edi
+; AVX512BWVL-NEXT:    vpextrw $4, %xmm1, %r8d
+; AVX512BWVL-NEXT:    vpextrw $2, %xmm1, %r9d
+; AVX512BWVL-NEXT:    vmovd %xmm1, %r10d
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vpextrw $6, %xmm1, %r11d
+; AVX512BWVL-NEXT:    vpextrw $4, %xmm1, %ebx
+; AVX512BWVL-NEXT:    vpextrw $2, %xmm1, %ebp
+; AVX512BWVL-NEXT:    vmovd %xmm1, %r14d
+; AVX512BWVL-NEXT:    vpmovdb %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    popq %rbx
+; AVX512BWVL-NEXT:    popq %r14
+; AVX512BWVL-NEXT:    popq %rbp
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
   %n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %n1 = trunc <16 x i16> %n0 to <16 x i8>
   ret <16 x i8> %n1
@@ -1304,186 +1270,50 @@ define <16 x i8> @oddelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %rbp
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %rbp
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-ONLY-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY:       # %bb.0:
-; AVX512BWVL-ONLY-NEXT:    pushq %rbp
-; AVX512BWVL-ONLY-NEXT:    pushq %r14
-; AVX512BWVL-ONLY-NEXT:    pushq %rbx
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    popq %rbx
-; AVX512BWVL-ONLY-NEXT:    popq %r14
-; AVX512BWVL-ONLY-NEXT:    popq %rbp
-; AVX512BWVL-ONLY-NEXT:    vzeroupper
-; AVX512BWVL-ONLY-NEXT:    retq
-;
-; AVX512VBMI-FAST-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
-; AVX512VBMI-FAST-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-FAST-NEXT:    vzeroupper
-; AVX512VBMI-FAST-NEXT:    retq
-;
-; AVX512VBMI-SLOW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,u,u,u]
-; AVX512VBMI-SLOW-NEXT:    vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT:    vpextrw $5, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT:    vpextrw $3, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vzeroupper
-; AVX512VBMI-SLOW-NEXT:    retq
+; AVX512-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vpextrw $7, %xmm1, %eax
+; AVX512-NEXT:    vpextrw $5, %xmm1, %ecx
+; AVX512-NEXT:    vpextrw $3, %xmm1, %edx
+; AVX512-NEXT:    vpextrw $1, %xmm1, %esi
+; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512-NEXT:    vpextrw $7, %xmm1, %edi
+; AVX512-NEXT:    vpextrw $5, %xmm1, %r8d
+; AVX512-NEXT:    vpextrw $3, %xmm1, %r9d
+; AVX512-NEXT:    vpextrw $1, %xmm1, %r10d
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpextrw $7, %xmm1, %r11d
+; AVX512-NEXT:    vpextrw $5, %xmm1, %ebx
+; AVX512-NEXT:    vpextrw $3, %xmm1, %ebp
+; AVX512-NEXT:    vpextrw $1, %xmm1, %r14d
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %n1 = trunc <16 x i16> %n0 to <16 x i8>
   ret <16 x i8> %n1
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512BWVL-ONLY: {{.*}}
 ; AVX512VBMI: {{.*}}
+; AVX512VBMI-FAST: {{.*}}
+; AVX512VBMI-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706c..78d3914caff0d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -406,26 +406,26 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT:    vcvtdq2pd %xmm1, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: PR34175:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BWVL-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BWVL-NEXT:    vcvtdq2pd %xmm1, %ymm0
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512VBMI-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX512VBMI-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vcvtdq2pd %xmm1, %ymm0
 ; AVX512VBMI-NEXT:    retq
   %v = load <32 x i16>, ptr %p, align 2
   %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 9db2f6ba1e810..9df844214975c 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -88,12 +88,9 @@ define <2 x i32> @test5(<8 x i32> %v) {
 ; AVX512-LABEL: test5:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rcx
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,8,0,0]
+; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %x = sext <8 x i32> %v to <8 x i64>
@@ -185,13 +182,9 @@ define <2 x i32> @test10(<8 x i32> %v) {
 ;
 ; AVX512-LABEL: test10:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rcx
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    valignd {{.*#+}} zmm0 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %x = zext <8 x i32> %v to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index 179e8ad69672b..995e574af44d5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -101,9 +101,10 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    subl $32, %esp
 ; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    vmovhps %xmm0, (%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512F-32-NEXT:    vmovlps %xmm0, (%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
@@ -131,11 +132,11 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-64-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512F-64-NEXT:    vmovq %rax, %xmm2
-; AVX512F-64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-64-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX512F-64-NEXT:    vmovq %rax, %xmm1
-; AVX512F-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-64-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-64-NEXT:    vcvttsd2si %xmm2, %rax
+; AVX512F-64-NEXT:    vmovq %rax, %xmm2
+; AVX512F-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-64-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX512F-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512F-64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -156,9 +157,10 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    subl $32, %esp
 ; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vmovhps %xmm0, (%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
@@ -186,11 +188,11 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512VL-64-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-64-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-64-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-64-NEXT:    vcvttsd2si %xmm2, %rax
+; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -377,16 +379,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    andl $-8, %esp
 ; AVX512F-32-NEXT:    subl $40, %esp
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512F-32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
 ; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512F-32-NEXT:    xorl %eax, %eax
-; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm3
+; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovsd %xmm3, (%esp)
+; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512F-32-NEXT:    xorl %edx, %edx
 ; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %dl
@@ -445,11 +447,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-64-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512F-64-NEXT:    vmovq %rax, %xmm2
-; AVX512F-64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-64-NEXT:    vcvttsd2usi %xmm1, %rax
 ; AVX512F-64-NEXT:    vmovq %rax, %xmm1
-; AVX512F-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-64-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-64-NEXT:    vcvttsd2usi %xmm2, %rax
+; AVX512F-64-NEXT:    vmovq %rax, %xmm2
+; AVX512F-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-64-NEXT:    vcvttsd2usi %xmm0, %rax
 ; AVX512F-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512F-64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -470,16 +472,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    andl $-8, %esp
 ; AVX512VL-32-NEXT:    subl $40, %esp
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
 ; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovsd %xmm3, (%esp)
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512VL-32-NEXT:    xorl %edx, %edx
 ; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %dl
@@ -538,11 +540,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512VL-64-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-64-NEXT:    vcvttsd2usi %xmm1, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-64-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-64-NEXT:    vcvttsd2usi %xmm2, %rax
+; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vcvttsd2usi %xmm0, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index ce5db5b246775..f4ed4405b1770 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -46,9 +46,10 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovhps %xmm1, (%esp)
 ; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vmovhps %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
@@ -107,12 +108,12 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-64-NEXT:    vcvttsd2si %xmm2, %rax
-; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
-; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512VL-64-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512VL-64-NEXT:    vcvttsd2si %xmm2, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-64-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
 ; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
@@ -184,16 +185,16 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
 ; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
-; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %cl
@@ -296,12 +297,12 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-64-NEXT:    vcvttsd2usi %xmm2, %rax
-; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
-; AVX512VL-64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512VL-64-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512VL-64-NEXT:    vcvttsd2usi %xmm2, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-64-NEXT:    vcvttsd2usi %xmm3, %rax
+; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
 ; AVX512VL-64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vcvttsd2usi %xmm0, %rax
 ; AVX512VL-64-NEXT:    vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index a336d0a01fa7b..84ade9e23ce03 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -631,40 +631,40 @@ define <4 x double> @uitofp_v4i32_v4f64(<4 x i32> %x) #0 {
 }
 
 define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
-; AVX-32-LABEL: sitofp_v4i64_v4f64:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    pushl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX-32-NEXT:    .cfi_offset %ebp, -8
-; AVX-32-NEXT:    movl %esp, %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
-; AVX-32-NEXT:    andl $-8, %esp
-; AVX-32-NEXT:    subl $64, %esp
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstpl (%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT:    movl %ebp, %esp
-; AVX-32-NEXT:    popl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa %esp, 4
-; AVX-32-NEXT:    retl
+; AVX1-32-LABEL: sitofp_v4i64_v4f64:
+; AVX1-32:       # %bb.0:
+; AVX1-32-NEXT:    pushl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-32-NEXT:    .cfi_offset %ebp, -8
+; AVX1-32-NEXT:    movl %esp, %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT:    andl $-8, %esp
+; AVX1-32-NEXT:    subl $64, %esp
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstpl (%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT:    movl %ebp, %esp
+; AVX1-32-NEXT:    popl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT:    retl
 ;
 ; AVX1-64-LABEL: sitofp_v4i64_v4f64:
 ; AVX1-64:       # %bb.0:
@@ -682,6 +682,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-64-NEXT:    retq
 ;
+; AVX2-32-LABEL: sitofp_v4i64_v4f64:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    pushl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-32-NEXT:    .cfi_offset %ebp, -8
+; AVX2-32-NEXT:    movl %esp, %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT:    andl $-8, %esp
+; AVX2-32-NEXT:    subl $64, %esp
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstpl (%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX2-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX2-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-32-NEXT:    movl %ebp, %esp
+; AVX2-32-NEXT:    popl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT:    retl
+;
 ; AVX2-64-LABEL: sitofp_v4i64_v4f64:
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -698,6 +733,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX2-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-64-NEXT:    retq
 ;
+; AVX512F-32-LABEL: sitofp_v4i64_v4f64:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-8, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstpl (%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512F-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+;
 ; AVX512F-64-LABEL: sitofp_v4i64_v4f64:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -714,6 +784,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512F-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-64-NEXT:    retq
 ;
+; AVX512VL-32-LABEL: sitofp_v4i64_v4f64:
+; AVX512VL-32:       # %bb.0:
+; AVX512VL-32-NEXT:    pushl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT:    movl %esp, %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT:    andl $-8, %esp
+; AVX512VL-32-NEXT:    subl $64, %esp
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstpl (%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512VL-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512VL-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-32-NEXT:    movl %ebp, %esp
+; AVX512VL-32-NEXT:    popl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT:    retl
+;
 ; AVX512VL-64-LABEL: sitofp_v4i64_v4f64:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -748,55 +853,55 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 }
 
 define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
-; AVX-32-LABEL: uitofp_v4i64_v4f64:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    pushl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX-32-NEXT:    .cfi_offset %ebp, -8
-; AVX-32-NEXT:    movl %esp, %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
-; AVX-32-NEXT:    andl $-8, %esp
-; AVX-32-NEXT:    subl $64, %esp
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstpl (%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $1, %xmm1, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $3, %xmm1, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT:    movl %ebp, %esp
-; AVX-32-NEXT:    popl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa %esp, 4
-; AVX-32-NEXT:    retl
+; AVX1-32-LABEL: uitofp_v4i64_v4f64:
+; AVX1-32:       # %bb.0:
+; AVX1-32-NEXT:    pushl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-32-NEXT:    .cfi_offset %ebp, -8
+; AVX1-32-NEXT:    movl %esp, %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT:    andl $-8, %esp
+; AVX1-32-NEXT:    subl $64, %esp
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstpl (%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT:    movl %ebp, %esp
+; AVX1-32-NEXT:    popl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT:    retl
 ;
 ; AVX1-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX1-64:       # %bb.0:
@@ -828,6 +933,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-64-NEXT:    retq
 ;
+; AVX2-32-LABEL: uitofp_v4i64_v4f64:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    pushl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-32-NEXT:    .cfi_offset %ebp, -8
+; AVX2-32-NEXT:    movl %esp, %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT:    andl $-8, %esp
+; AVX2-32-NEXT:    subl $64, %esp
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX2-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstpl (%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX2-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX2-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-32-NEXT:    movl %ebp, %esp
+; AVX2-32-NEXT:    popl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT:    retl
+;
 ; AVX2-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -859,6 +1014,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX2-64-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; AVX2-64-NEXT:    retq
 ;
+; AVX512F-32-LABEL: uitofp_v4i64_v4f64:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-8, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstpl (%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512F-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+;
 ; AVX512F-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -875,6 +1080,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512F-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-64-NEXT:    retq
 ;
+; AVX512VL-32-LABEL: uitofp_v4i64_v4f64:
+; AVX512VL-32:       # %bb.0:
+; AVX512VL-32-NEXT:    pushl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT:    movl %esp, %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT:    andl $-8, %esp
+; AVX512VL-32-NEXT:    subl $64, %esp
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstpl (%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512VL-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512VL-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-32-NEXT:    movl %ebp, %esp
+; AVX512VL-32-NEXT:    popl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT:    retl
+;
 ; AVX512VL-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -909,40 +1164,40 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 }
 
 define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
-; AVX-32-LABEL: sitofp_v4i64_v4f32:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    pushl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX-32-NEXT:    .cfi_offset %ebp, -8
-; AVX-32-NEXT:    movl %esp, %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
-; AVX-32-NEXT:    andl $-8, %esp
-; AVX-32-NEXT:    subl $48, %esp
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fstps (%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX-32-NEXT:    movl %ebp, %esp
-; AVX-32-NEXT:    popl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa %esp, 4
-; AVX-32-NEXT:    vzeroupper
-; AVX-32-NEXT:    retl
+; AVX1-32-LABEL: sitofp_v4i64_v4f32:
+; AVX1-32:       # %bb.0:
+; AVX1-32-NEXT:    pushl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-32-NEXT:    .cfi_offset %ebp, -8
+; AVX1-32-NEXT:    movl %esp, %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT:    andl $-8, %esp
+; AVX1-32-NEXT:    subl $48, %esp
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fstps (%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-32-NEXT:    movl %ebp, %esp
+; AVX1-32-NEXT:    popl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT:    vzeroupper
+; AVX1-32-NEXT:    retl
 ;
 ; AVX1-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX1-64:       # %bb.0:
@@ -961,6 +1216,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vzeroupper
 ; AVX1-64-NEXT:    retq
 ;
+; AVX2-32-LABEL: sitofp_v4i64_v4f32:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    pushl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-32-NEXT:    .cfi_offset %ebp, -8
+; AVX2-32-NEXT:    movl %esp, %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT:    andl $-8, %esp
+; AVX2-32-NEXT:    subl $48, %esp
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fstps (%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX2-32-NEXT:    movl %ebp, %esp
+; AVX2-32-NEXT:    popl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT:    vzeroupper
+; AVX2-32-NEXT:    retl
+;
 ; AVX2-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
@@ -978,6 +1268,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX2-64-NEXT:    vzeroupper
 ; AVX2-64-NEXT:    retq
 ;
+; AVX512F-32-LABEL: sitofp_v4i64_v4f32:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-8, %esp
+; AVX512F-32-NEXT:    subl $48, %esp
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fstps (%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    vzeroupper
+; AVX512F-32-NEXT:    retl
+;
 ; AVX512F-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
@@ -995,6 +1320,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512F-64-NEXT:    vzeroupper
 ; AVX512F-64-NEXT:    retq
 ;
+; AVX512VL-32-LABEL: sitofp_v4i64_v4f32:
+; AVX512VL-32:       # %bb.0:
+; AVX512VL-32-NEXT:    pushl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT:    movl %esp, %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT:    andl $-8, %esp
+; AVX512VL-32-NEXT:    subl $48, %esp
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fstps (%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512VL-32-NEXT:    movl %ebp, %esp
+; AVX512VL-32-NEXT:    popl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT:    vzeroupper
+; AVX512VL-32-NEXT:    retl
+;
 ; AVX512VL-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
@@ -1032,55 +1392,55 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 }
 
 define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
-; AVX-32-LABEL: uitofp_v4i64_v4f32:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    pushl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX-32-NEXT:    .cfi_offset %ebp, -8
-; AVX-32-NEXT:    movl %esp, %ebp
-; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
-; AVX-32-NEXT:    andl $-8, %esp
-; AVX-32-NEXT:    subl $48, %esp
-; AVX-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstps (%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $1, %xmm1, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vextractps $3, %xmm1, %eax
-; AVX-32-NEXT:    shrl $31, %eax
-; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT:    wait
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX-32-NEXT:    movl %ebp, %esp
-; AVX-32-NEXT:    popl %ebp
-; AVX-32-NEXT:    .cfi_def_cfa %esp, 4
-; AVX-32-NEXT:    vzeroupper
-; AVX-32-NEXT:    retl
+; AVX1-32-LABEL: uitofp_v4i64_v4f32:
+; AVX1-32:       # %bb.0:
+; AVX1-32-NEXT:    pushl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-32-NEXT:    .cfi_offset %ebp, -8
+; AVX1-32-NEXT:    movl %esp, %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT:    andl $-8, %esp
+; AVX1-32-NEXT:    subl $48, %esp
+; AVX1-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX1-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstps (%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX1-32-NEXT:    shrl $31, %eax
+; AVX1-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT:    wait
+; AVX1-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-32-NEXT:    movl %ebp, %esp
+; AVX1-32-NEXT:    popl %ebp
+; AVX1-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT:    vzeroupper
+; AVX1-32-NEXT:    retl
 ;
 ; AVX1-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX1-64:       # %bb.0:
@@ -1109,6 +1469,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vzeroupper
 ; AVX1-64-NEXT:    retq
 ;
+; AVX2-32-LABEL: uitofp_v4i64_v4f32:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    pushl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-32-NEXT:    .cfi_offset %ebp, -8
+; AVX2-32-NEXT:    movl %esp, %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT:    andl $-8, %esp
+; AVX2-32-NEXT:    subl $48, %esp
+; AVX2-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX2-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstps (%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX2-32-NEXT:    shrl $31, %eax
+; AVX2-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT:    wait
+; AVX2-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX2-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX2-32-NEXT:    movl %ebp, %esp
+; AVX2-32-NEXT:    popl %ebp
+; AVX2-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT:    vzeroupper
+; AVX2-32-NEXT:    retl
+;
 ; AVX2-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
@@ -1135,6 +1545,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX2-64-NEXT:    vzeroupper
 ; AVX2-64-NEXT:    retq
 ;
+; AVX512F-32-LABEL: uitofp_v4i64_v4f32:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-8, %esp
+; AVX512F-32-NEXT:    subl $48, %esp
+; AVX512F-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstps (%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
+; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512F-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    vzeroupper
+; AVX512F-32-NEXT:    retl
+;
 ; AVX512F-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
@@ -1152,6 +1612,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512F-64-NEXT:    vzeroupper
 ; AVX512F-64-NEXT:    retq
 ;
+; AVX512VL-32-LABEL: uitofp_v4i64_v4f32:
+; AVX512VL-32:       # %bb.0:
+; AVX512VL-32-NEXT:    pushl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT:    movl %esp, %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT:    andl $-8, %esp
+; AVX512VL-32-NEXT:    subl $48, %esp
+; AVX512VL-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstps (%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $3, %xmm0, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $1, %xmm1, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vextractps $3, %xmm1, %eax
+; AVX512VL-32-NEXT:    shrl $31, %eax
+; AVX512VL-32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
+; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512VL-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512VL-32-NEXT:    movl %ebp, %esp
+; AVX512VL-32-NEXT:    popl %ebp
+; AVX512VL-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT:    vzeroupper
+; AVX512VL-32-NEXT:    retl
+;
 ; AVX512VL-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
@@ -1189,3 +1699,5 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 }
 
 attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX-32: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index 0cf945202a2d4..0cdd2821f92b3 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -271,21 +271,21 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    .cfi_def_cfa_register %ebp
 ; NODQ-32-NEXT:    andl $-8, %esp
 ; NODQ-32-NEXT:    subl $128, %esp
-; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
@@ -369,64 +369,64 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    .cfi_def_cfa_register %ebp
 ; NODQ-32-NEXT:    andl $-8, %esp
 ; NODQ-32-NEXT:    subl $128, %esp
-; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; NODQ-32-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; NODQ-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm4, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-32-NEXT:    vmovq %xmm3, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-32-NEXT:    vmovq %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpermq {{.*#+}} zmm4 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT:    vmovq %xmm4, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpextrd $1, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm0, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm0, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl (%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm0, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm0, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm1, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm1, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -498,21 +498,21 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    .cfi_def_cfa_register %ebp
 ; NODQ-32-NEXT:    andl $-8, %esp
 ; NODQ-32-NEXT:    subl $96, %esp
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
@@ -594,64 +594,64 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    .cfi_def_cfa_register %ebp
 ; NODQ-32-NEXT:    andl $-8, %esp
 ; NODQ-32-NEXT:    subl $96, %esp
-; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; NODQ-32-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; NODQ-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm4, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractps $1, %xmm0, %eax
+; NODQ-32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NODQ-32-NEXT:    vmovq %xmm3, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-32-NEXT:    vmovq %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    valignq {{.*#+}} zmm4 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT:    vmovq %xmm4, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpextrd $1, %xmm0, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps (%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm0, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm0, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm1, %eax
+; NODQ-32-NEXT:    vpextrd $1, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm1, %eax
+; NODQ-32-NEXT:    vpextrd $3, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index df2dc77dc1259..64acfd4b1b39f 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -161,11 +161,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vcvttsd2si %xmm2, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -179,11 +179,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-NEXT:    vcvttsd2si %xmm2, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm2
 ; AVX512F-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -197,11 +197,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-NEXT:    vcvttsd2si %xmm2, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -606,17 +606,17 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vsubsd %xmm1, %xmm3, %xmm4
 ; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX2-NEXT:    vcvttsd2si %xmm3, %rcx
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vmovq %rdx, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
 ; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
 ; AVX2-NEXT:    vcvttsd2si %xmm0, %rcx
@@ -642,11 +642,11 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-NEXT:    vcvttsd2usi %xmm2, %rax
+; AVX512F-NEXT:    vmovq %rax, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm2
 ; AVX512F-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -660,11 +660,11 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-NEXT:    vcvttsd2usi %xmm2, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index e88387a8b7c69..3161ef4318c49 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -531,34 +531,35 @@ define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x
 ; AVX2-NEXT:    addl %eax, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovlpd %xmm0, (%rsp)
+; AVX2-NEXT:    vmovlps %xmm0, (%rsp)
 ; AVX2-NEXT:    vmovq %xmm3, %rax
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT:    vmovhps %xmm0, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
 ; AVX2-NEXT:    subq %rcx, %rax
 ; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovlpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vmovlps %xmm2, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
 ; AVX2-NEXT:    vmovq %xmm2, %rcx
 ; AVX2-NEXT:    subq %rcx, %rax
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
 ; AVX2-NEXT:    subq %rcx, %rax
 ; AVX2-NEXT:    cmpq $4, %rax
-; AVX2-NEXT:    jb .LBB7_2
+; AVX2-NEXT:    jae .LBB7_2
 ; AVX2-NEXT:  # %bb.1:
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX2-NEXT:  .LBB7_2:
 ; AVX2-NEXT:    cmpq $3, %rax
 ; AVX2-NEXT:    movl $3, %ecx
 ; AVX2-NEXT:    cmovbq %rax, %rcx
 ; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    vmovsd %xmm1, (%rsp,%rax,8)
+; AVX2-NEXT:    vmovsd %xmm0, (%rsp,%rax,8)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
@@ -1012,12 +1013,13 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x
 ; AVX2-NEXT:    vpextrw $1, %xmm2, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vmovlps %xmm4, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vpextrw $2, %xmm2, %eax
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    vmovhps %xmm0, (%rsp,%rax,8)
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rax,8)
 ; AVX2-NEXT:    vpextrw $3, %xmm2, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
@@ -1026,36 +1028,37 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x
 ; AVX2-NEXT:    addq %rcx, %rax
 ; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $7, %ecx
-; AVX2-NEXT:    vmovlpd %xmm1, (%rsp,%rcx,8)
+; AVX2-NEXT:    vmovlps %xmm1, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vpextrw $5, %xmm2, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vmovhpd %xmm1, (%rsp,%rax,8)
+; AVX2-NEXT:    vmovhps %xmm1, (%rsp,%rax,8)
 ; AVX2-NEXT:    vpextrw $6, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addq %rcx, %rdx
 ; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $7, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vmovlpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rcx,8)
 ; AVX2-NEXT:    vpextrw $7, %xmm2, %eax
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rdx, %rax
 ; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $7, %edx
-; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rdx,8)
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rdx,8)
 ; AVX2-NEXT:    cmpq $8, %rax
-; AVX2-NEXT:    jb .LBB11_2
+; AVX2-NEXT:    jae .LBB11_2
 ; AVX2-NEXT:  # %bb.1:
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT:    vmovaps %xmm3, %xmm0
 ; AVX2-NEXT:  .LBB11_2:
 ; AVX2-NEXT:    cmpq $7, %rax
 ; AVX2-NEXT:    movl $7, %ecx
 ; AVX2-NEXT:    cmovbq %rax, %rcx
 ; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    vmovsd %xmm3, (%rsp,%rax,8)
+; AVX2-NEXT:    vmovsd %xmm0, (%rsp,%rax,8)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    movq %rbp, %rsp
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index bdbe3c09e5782..c2142931f02b7 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3182,26 +3182,27 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX2-LABEL: cvt_4f64_to_4i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $88, %rsp
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3245,27 +3246,28 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    subq $72, %rsp
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
@@ -3314,26 +3316,27 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX2-LABEL: cvt_4f64_to_8i16_undef:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $88, %rsp
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3377,27 +3380,28 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    subq $72, %rsp
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
@@ -3447,26 +3451,27 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX2-LABEL: cvt_4f64_to_8i16_zero:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $88, %rsp
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3510,27 +3515,28 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    subq $72, %rsp
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vinsertps $28, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
@@ -3541,63 +3547,123 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 }
 
 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
-; AVX-LABEL: cvt_8f64_to_8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    subq $104, %rsp
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    addq $104, %rsp
-; AVX-NEXT:    retq
+; AVX1-LABEL: cvt_8f64_to_8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    subq $104, %rsp
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT:    addq $104, %rsp
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cvt_8f64_to_8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    subq $104, %rsp
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT:    addq $104, %rsp
+; AVX2-NEXT:    retq
 ;
 ; F16C-LABEL: cvt_8f64_to_8i16:
 ; F16C:       # %bb.0:
@@ -3686,20 +3752,21 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3708,8 +3775,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX512-NEXT:    addq $120, %rsp
@@ -3865,24 +3932,25 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    subq $80, %rsp
 ; AVX2-NEXT:    movq %rdi, %rbx
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
 ; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
@@ -3936,27 +4004,28 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rbx
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovq %xmm0, (%rbx)
 ; AVX512-NEXT:    addq $64, %rsp
@@ -4009,26 +4078,27 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    subq $80, %rsp
 ; AVX2-NEXT:    movq %rdi, %rbx
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -4080,27 +4150,28 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rbx
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
@@ -4159,26 +4230,27 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    subq $80, %rsp
 ; AVX2-NEXT:    movq %rdi, %rbx
-; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncdfhf2 at PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -4230,27 +4302,28 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rbx
 ; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[1,0]
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vinsertps $28, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
 ; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
 ; AVX512-NEXT:    addq $64, %rsp
@@ -4264,67 +4337,131 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
 }
 
 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
-; AVX-LABEL: store_cvt_8f64_to_8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    subq $96, %rsp
-; AVX-NEXT:    movq %rdi, %rbx
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[1,0]
-; AVX-NEXT:    callq __truncdfhf2 at PLT
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovdqa %xmm0, (%rbx)
-; AVX-NEXT:    addq $96, %rsp
-; AVX-NEXT:    popq %rbx
-; AVX-NEXT:    retq
+; AVX1-LABEL: store_cvt_8f64_to_8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $96, %rsp
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = mem[1,0]
+; AVX1-NEXT:    callq __truncdfhf2 at PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT:    vmovdqa %xmm0, (%rbx)
+; AVX1-NEXT:    addq $96, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: store_cvt_8f64_to_8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $96, %rsp
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncdfhf2 at PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rbx)
+; AVX2-NEXT:    addq $96, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    retq
 ;
 ; F16C-LABEL: store_cvt_8f64_to_8i16:
 ; F16C:       # %bb.0:
@@ -4419,20 +4556,21 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm0 = mem[1,0]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4441,8 +4579,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index a39bc6b668669..9704bf59fdaa2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -845,43 +845,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride3_vf16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vmovdqa %ymm4, (%rsi)
 ; AVX2-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -889,43 +889,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-FP-LABEL: load_i16_stride3_vf16:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rsi)
 ; AVX2-FP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-FP-NEXT:    vzeroupper
@@ -933,43 +933,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-FCP-LABEL: load_i16_stride3_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-FCP-NEXT:    vzeroupper
@@ -977,176 +977,172 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride3_vf16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
 ; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride3_vf16:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride3_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf16:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -1501,22 +1497,25 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm6[1],xmm14[2,3],xmm6[4],xmm14[5,6],xmm6[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7]
-; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
-; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7]
-; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm13, %ymm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vmovq {{.*#+}} xmm15 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm15, %xmm6, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm4[0],xmm14[1,2],xmm4[3],xmm14[4,5],xmm4[6],xmm14[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5],xmm12[6],xmm15[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
@@ -1548,252 +1547,258 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride3_vf32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-NEXT:    vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm8
+; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT:    vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-NEXT:    vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-NEXT:    vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX2-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT:    vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT:    vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
 ; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm3, 32(%rsi)
-; AVX2-NEXT:    vmovdqa %ymm9, (%rsi)
-; AVX2-NEXT:    vmovdqa %ymm10, 32(%rdx)
-; AVX2-NEXT:    vmovdqa %ymm11, (%rdx)
-; AVX2-NEXT:    vmovdqa %ymm2, 32(%rcx)
+; AVX2-NEXT:    vmovdqa %ymm7, 32(%rsi)
+; AVX2-NEXT:    vmovdqa %ymm6, (%rsi)
+; AVX2-NEXT:    vmovdqa %ymm11, 32(%rdx)
+; AVX2-NEXT:    vmovdqa %ymm10, (%rdx)
+; AVX2-NEXT:    vmovdqa %ymm4, 32(%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride3_vf32:
 ; AVX2-FP:       # %bb.0:
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm14
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
 ; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm9, (%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm10, 32(%rdx)
-; AVX2-FP-NEXT:    vmovdqa %ymm11, (%rdx)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm7, 32(%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm11, 32(%rdx)
+; AVX2-FP-NEXT:    vmovdqa %ymm10, (%rdx)
+; AVX2-FP-NEXT:    vmovdqa %ymm4, 32(%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride3_vf32:
 ; AVX2-FCP:       # %bb.0:
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%rsi)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm11, (%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rsi)
+; AVX2-FCP-NEXT:    vmovdqa %ymm11, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm10, (%rdx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
@@ -1813,55 +1818,54 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm8 ^ (ymm10 & (ymm9 ^ ymm8))
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm12
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm16
 ; AVX512-NEXT:    vmovdqa %ymm0, %ymm10
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vmovdqa %ymm12, %ymm13
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm10[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vmovdqa %ymm13, %ymm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm8 ^ ymm9))
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1],xmm12[2],xmm14[3,4],xmm12[5],xmm14[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm6 ^ ymm5))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7],ymm5[8],ymm13[9,10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm8 ^ ymm9))
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
@@ -1870,8 +1874,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512-NEXT:    vmovdqa64 %zmm10, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512-NEXT:    vmovdqa64 %zmm7, (%rdx)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1879,77 +1883,78 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-LABEL: load_i16_stride3_vf32:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm6 ^ ymm5))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm1 ^ (ymm6 & (ymm2 ^ ymm1))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm6 & (ymm10 ^ ymm9))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [24,25,26,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512-FCP-NEXT:    vpermi2d %zmm13, %zmm11, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm8, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm11
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm9 ^ ymm10))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm11[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm14
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm1 ^ ymm2))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3,4],xmm5[5],xmm14[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7],ymm8[8],ymm13[9,10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -1969,55 +1974,54 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm8 ^ (ymm10 & (ymm9 ^ ymm8))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm16
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm10
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm13
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm10[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm8 ^ ymm9))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1],xmm12[2],xmm14[3,4],xmm12[5],xmm14[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm6 ^ ymm5))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7],ymm5[8],ymm13[9,10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm8 ^ ymm9))
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
@@ -2026,8 +2030,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2035,77 +2039,78 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm6 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm1 ^ (ymm6 & (ymm2 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm6 & (ymm10 ^ ymm9))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [24,25,26,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm13, %zmm11, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm8, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm9 ^ ymm10))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm11[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm14
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm1 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3,4],xmm5[5],xmm14[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7],ymm8[8],ymm13[9,10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -2693,896 +2698,939 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i16_stride3_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $408, %rsp # imm = 0x198
-; AVX-NEXT:    vmovdqa 176(%rdi), %xmm6
-; AVX-NEXT:    vmovdqa 160(%rdi), %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX-NEXT:    subq $488, %rsp # imm = 0x1E8
+; AVX-NEXT:    vmovdqa 176(%rdi), %xmm5
+; AVX-NEXT:    vmovdqa 160(%rdi), %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7]
+; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa 144(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 144(%rdi), %xmm7
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 112(%rdi), %xmm10
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 368(%rdi), %xmm14
-; AVX-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 368(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7]
-; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 352(%rdi), %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm2[2],xmm15[3,4],xmm2[5],xmm15[6,7]
+; AVX-NEXT:    vmovdqa %xmm15, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa 336(%rdi), %xmm9
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 336(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 320(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 272(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa 240(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 240(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa 208(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 224(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 80(%rdi), %xmm13
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
-; AVX-NEXT:    vmovdqa %xmm11, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm6
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7]
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm4
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7]
-; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm1[3,4,5,6,7]
+; AVX-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb %xmm0, %xmm11, %xmm11
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm13[2],xmm5[3,4],xmm13[5],xmm5[6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
+; AVX-NEXT:    vmovq {{.*#+}} xmm9 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm9, %xmm7, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7]
-; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1],mem[2],xmm12[3,4],mem[5],xmm12[6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1,2],xmm1[3],xmm12[4,5],xmm1[6],xmm12[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm13 = mem[0],xmm13[1,2],mem[3],xmm13[4,5],mem[6],xmm13[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm13, %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm15[2],xmm2[3,4],xmm15[5],xmm2[6,7]
+; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,1],xmm12[2],mem[3,4],xmm12[5],mem[6,7]
-; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7]
+; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm13 = mem[0],xmm13[1,2],mem[3],xmm13[4,5],mem[6],xmm13[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm9, %xmm4, %xmm13
+; AVX-NEXT:    vmovdqa %xmm9, %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1,2],xmm1[3],xmm13[4,5],xmm1[6],xmm13[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm13, %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm11
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm11[0,1],mem[2],xmm11[3,4],mem[5],xmm11[6,7]
+; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[0],xmm11[1,2],mem[3],xmm11[4,5],mem[6],xmm11[7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7]
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0],mem[1],xmm12[2,3],mem[4],xmm12[5,6],mem[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm8
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = mem[0,1],xmm5[2],mem[3,4],xmm5[5],mem[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6],xmm14[7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm0
-; AVX-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3],mem[4],xmm0[5,6],mem[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm7[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3],mem[4],xmm9[5,6],mem[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm9
+; AVX-NEXT:    vpblendw $109, (%rsp), %xmm2, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0],xmm2[1],mem[2,3],xmm2[4],mem[5,6],xmm2[7]
+; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm15[2],xmm12[3,4],xmm15[5],xmm12[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm8[0],mem[1],xmm8[2,3],mem[4],xmm8[5,6],mem[7]
+; AVX-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm2[0],mem[1],xmm2[2,3],mem[4],xmm2[5,6],mem[7]
+; AVX-NEXT:    vmovdqa %xmm4, %xmm7
+; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX-NEXT:    vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[2,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 64(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, (%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 96(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX-NEXT:    vmovaps %ymm3, (%rcx)
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2],xmm2[3,4],mem[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
+; AVX-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[0,1],xmm3[2],mem[3,4],xmm3[5],mem[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7]
+; AVX-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm2, 64(%rcx)
+; AVX-NEXT:    vmovaps %ymm1, (%rcx)
 ; AVX-NEXT:    vmovaps %ymm0, 96(%rcx)
-; AVX-NEXT:    vmovaps %ymm8, 32(%rcx)
-; AVX-NEXT:    addq $408, %rsp # imm = 0x198
+; AVX-NEXT:    vmovaps %ymm9, 32(%rcx)
+; AVX-NEXT:    addq $488, %rsp # imm = 0x1E8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: load_i16_stride3_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $136, %rsp
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT:    subq $200, %rsp
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm9
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm12
 ; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-NEXT:    vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm7
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa 368(%rdi), %xmm15
-; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm2
-; AVX2-NEXT:    vmovdqa 272(%rdi), %xmm8
-; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm8
+; AVX2-NEXT:    vmovdqa 368(%rdi), %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm7
+; AVX2-NEXT:    vmovdqa 272(%rdi), %xmm6
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-NEXT:    vpshufb %ymm10, %ymm3, %ymm10
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm13, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-NEXT:    vmovdqa %ymm10, 64(%rdx)
-; AVX2-NEXT:    vmovdqa %ymm14, (%rdx)
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-NEXT:    vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdx)
+; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX2-NEXT:    vmovdqa %ymm13, 96(%rdx)
-; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT:    vmovdqa %ymm3, 64(%rcx)
-; AVX2-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm1, 96(%rcx)
-; AVX2-NEXT:    vmovdqa %ymm5, 32(%rcx)
-; AVX2-NEXT:    addq $136, %rsp
+; AVX2-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT:    vmovdqa %ymm6, 64(%rcx)
+; AVX2-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm3, 96(%rcx)
+; AVX2-NEXT:    vmovdqa %ymm9, 32(%rcx)
+; AVX2-NEXT:    addq $200, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride3_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $136, %rsp
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FP-NEXT:    subq $200, %rsp
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa 368(%rdi), %xmm15
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm2
-; AVX2-FP-NEXT:    vmovdqa 272(%rdi), %xmm8
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-FP-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 368(%rdi), %xmm14
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm7
+; AVX2-FP-NEXT:    vmovdqa 272(%rdi), %xmm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm3, %ymm10
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm10, 64(%rdx)
-; AVX2-FP-NEXT:    vmovdqa %ymm14, (%rdx)
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm2, 64(%rdx)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm13, 96(%rdx)
-; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rdx)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, 64(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 96(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rcx)
-; AVX2-FP-NEXT:    addq $136, %rsp
+; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, 64(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm3, 96(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm9, 32(%rcx)
+; AVX2-FP-NEXT:    addq $200, %rsp
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride3_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $136, %rsp
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT:    subq $200, %rsp
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 368(%rdi), %xmm15
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 272(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-FCP-NEXT:    vmovdqa 176(%rdi), %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vmovdqa 368(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovdqa 272(%rdi), %xmm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm10
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, (%rdx)
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm13, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, 32(%rcx)
-; AVX2-FCP-NEXT:    addq $136, %rsp
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm9, 32(%rcx)
+; AVX2-FCP-NEXT:    addq $200, %rsp
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride3_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm18
+; AVX512-NEXT:    vmovdqa64 352(%rdi), %ymm21
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm21 ^ ymm18))
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm2, %ymm5
-; AVX512-NEXT:    vmovdqa 272(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm19
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm21
-; AVX512-NEXT:    vmovdqa64 352(%rdi), %ymm22
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-NEXT:    vmovdqa 304(%rdi), %xmm8
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa 288(%rdi), %xmm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm5, %ymm10
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm15
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm17
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm25
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm25
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 224(%rdi), %ymm22
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm23
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm9
+; AVX512-NEXT:    vmovdqa 272(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm16
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm24
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm12
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm12 ^ ymm24))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6],ymm9[7],ymm3[8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm7
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm15
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm19 ^ ymm1))
+; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm24 ^ ymm12))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm30
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512-NEXT:    vmovdqa %ymm13, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
-; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm19
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm1 ^ ymm19))
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm2, %ymm11
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm20
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm18 ^ ymm21))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm9
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa %ymm13, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23))
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm6
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm13 & (ymm12 ^ ymm24))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2],xmm3[3,4],xmm14[5],xmm3[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm5 & (ymm1 ^ ymm19))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7],ymm3[8],ymm1[9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm7
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm21 ^ ymm18))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm22 ^ ymm23))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
 ; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm17, (%rsi)
 ; AVX512-NEXT:    vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm20, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -3590,162 +3638,169 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-LABEL: load_i16_stride3_vf64:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 272(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm21
-; AVX512-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm22
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 304(%rdi), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm21
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20))
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 272(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm23
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm11
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [24,25,26,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 304(%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm16, %zmm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm24 ^ ymm11))
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm18
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm5 ^ ymm18))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7],ymm1[8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14],ymm4[15]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm19
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm25
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm5 ^ (ymm2 & (ymm18 ^ ymm5))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm24 ^ (ymm2 & (ymm11 ^ ymm24))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm15[2],xmm8[3,4],xmm15[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm30
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3,4,5,6,7],ymm9[8,9,10],ymm13[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm16
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm23 ^ (ymm2 & (ymm22 ^ ymm23))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm21 ^ (ymm4 & (ymm20 ^ ymm21))
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm8
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm7 & (ymm5 ^ ymm18))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm1[2],xmm12[3,4],xmm1[5],xmm12[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm0 & (ymm11 ^ ymm24))
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm22 ^ (ymm7 & (ymm23 ^ ymm22))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21))
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0],xmm10[1],xmm14[2,3],xmm10[4],xmm14[5,6],xmm10[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
@@ -3753,164 +3808,170 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-LABEL: load_i16_stride3_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %ymm18
+; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %ymm21
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm21 ^ ymm18))
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm2, %ymm5
-; AVX512DQ-NEXT:    vmovdqa 272(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm19
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %ymm21
-; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-NEXT:    vmovdqa 304(%rdi), %xmm8
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm5, %ymm10
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm15
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm17
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm25
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 224(%rdi), %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm23
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm2, %ymm9
+; AVX512DQ-NEXT:    vmovdqa 272(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm24
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm12
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6],ymm9[7],ymm3[8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm15
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm19 ^ ymm1))
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm24 ^ ymm12))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm19
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm6
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm1 ^ ymm19))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm2, %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm20
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm18 ^ ymm21))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm9
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm13 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2],xmm3[3,4],xmm14[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm5 & (ymm1 ^ ymm19))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7],ymm3[8],ymm1[9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm21 ^ ymm18))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm22 ^ ymm23))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
 ; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rsi)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -3918,162 +3979,169 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 272(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 304(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20))
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 272(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [24,25,26,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 304(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm24 ^ ymm11))
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm5 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7],ymm1[8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm5 ^ (ymm2 & (ymm18 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm24 ^ (ymm2 & (ymm11 ^ ymm24))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm15[2],xmm8[3,4],xmm15[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3,4,5,6,7],ymm9[8,9,10],ymm13[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm23 ^ (ymm2 & (ymm22 ^ ymm23))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm21 ^ (ymm4 & (ymm20 ^ ymm21))
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm8
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm7 & (ymm5 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm1[2],xmm12[3,4],xmm1[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm0 & (ymm11 ^ ymm24))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm22 ^ (ymm7 & (ymm23 ^ ymm22))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0],xmm10[1],xmm14[2,3],xmm10[4],xmm14[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index ae4f85ce42a19..c21bc6927680f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1312,15 +1312,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512-NEXT:    vpmovqw %zmm3, %xmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm7
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm6
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -1334,10 +1334,10 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
@@ -1348,8 +1348,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
@@ -1365,44 +1365,46 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride4_vf16:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm8, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vpermd %zmm7, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT:    vpermt2d %ymm8, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vpmovqw %zmm6, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm7, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r8)
@@ -1426,15 +1428,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm6
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -1448,10 +1450,10 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
@@ -1462,8 +1464,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
@@ -1479,44 +1481,46 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf16:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vpermd %zmm7, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm6, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm7, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r8)
@@ -2719,8 +2723,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
@@ -2728,68 +2732,68 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm3
 ; AVX512-NEXT:    vpmovqw %ymm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm10
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm9
 ; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm10[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm13
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm9
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm1, %zmm9
-; AVX512-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm13
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm0, %zmm9
-; AVX512-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm1, %zmm8
+; AVX512-NEXT:    vpmovqw %zmm8, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm10[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm12
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm12[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm0, %zmm10
+; AVX512-NEXT:    vpmovqw %zmm10, %xmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm12
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7]
 ; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm13
 ; AVX512-NEXT:    vpmovqw %zmm13, %xmm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
@@ -2798,15 +2802,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
 ; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm14
 ; AVX512-NEXT:    vpmovqw %zmm14, %xmm14
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm13[0,1,2,3],zmm10[0,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
@@ -2824,7 +2828,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
@@ -2834,7 +2838,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm2, (%rsi)
 ; AVX512-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm12, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm10, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -2842,79 +2846,81 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-LABEL: load_i16_stride4_vf32:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT:    vpermt2d %ymm7, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vpmovqw %zmm1, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm4, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm4, %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm4
-; AVX512-FCP-NEXT:    vpermt2d %ymm13, %ymm11, %ymm4
-; AVX512-FCP-NEXT:    vpmovqw %zmm0, %xmm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm1, %zmm9
-; AVX512-FCP-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vpmovqw %zmm12, %xmm12
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,2,2,3,0,2,4,6]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm9
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm10, %zmm11
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm12
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm13, %ymm9
+; AVX512-FCP-NEXT:    vpmovqw %zmm1, %xmm12
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm9, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm13
-; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm11, %ymm13
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vpmovqw %zmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm9, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm11, %ymm3
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm15
+; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm13, %ymm5
+; AVX512-FCP-NEXT:    vpmovqw %zmm0, %xmm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm5[0,1,2,3],zmm9[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm1, %zmm11
+; AVX512-FCP-NEXT:    vpmovqw %zmm11, %xmm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm14, %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vpmovqw %zmm11, %xmm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm14, %zmm6
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm15
+; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm13, %ymm11
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm1, %zmm15
+; AVX512-FCP-NEXT:    vpmovqw %zmm15, %xmm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm3
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm13, %ymm3
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm11[0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
 ; AVX512-FCP-NEXT:    vpsrlq $48, %zmm1, %zmm1
 ; AVX512-FCP-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpsrlq $48, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -2930,8 +2936,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
@@ -2939,68 +2945,68 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm3
 ; AVX512DQ-NEXT:    vpmovqw %ymm3, %xmm3
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm9
 ; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm10[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm4
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm11
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm13
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm9
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm1, %zmm9
-; AVX512DQ-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm13
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm1, %zmm8
+; AVX512DQ-NEXT:    vpmovqw %zmm8, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm10[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm12
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm12[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm0, %zmm10
+; AVX512DQ-NEXT:    vpmovqw %zmm10, %xmm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vpsrlq $32, %zmm1, %zmm13
 ; AVX512DQ-NEXT:    vpmovqw %zmm13, %xmm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
@@ -3009,15 +3015,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpsrlq $32, %zmm0, %zmm14
 ; AVX512DQ-NEXT:    vpmovqw %zmm14, %xmm14
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm13[0,1,2,3],zmm10[0,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
@@ -3035,7 +3041,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
@@ -3045,7 +3051,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rsi)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -3053,79 +3059,81 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm7, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm1, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm4, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm4, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm13, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm0, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm1, %zmm9
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,2,2,3,0,2,4,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm13, %ymm9
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm1, %xmm12
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm9, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm13
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm11, %ymm3
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm0, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm5[0,1,2,3],zmm9[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm14, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm1, %zmm15
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm13, %ymm3
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm11[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm1, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -5673,259 +5681,256 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride4_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $200, %rsp
-; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm27
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm28
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm29
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm23
+; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm24
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm25
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm27
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm0
 ; AVX512-NEXT:    vpmovqw %ymm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm14
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm13
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpmovqw %zmm29, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT:    vpmovqw %ymm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm15
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpmovqw %zmm27, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512-NEXT:    vpmovqw %ymm2, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm12
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm11
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm13
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpmovqw %zmm28, %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpmovqw %zmm25, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 448(%rdi), %ymm0
 ; AVX512-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 496(%rdi), %xmm24
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 480(%rdi), %xmm23
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpmovqw %zmm27, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm1
-; AVX512-NEXT:    vpmovqw %ymm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vmovdqa64 368(%rdi), %xmm31
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 352(%rdi), %xmm25
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vpmovqw %zmm26, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 320(%rdi), %xmm30
-; AVX512-NEXT:    vmovdqa64 336(%rdi), %xmm17
-; AVX512-NEXT:    vmovdqa64 448(%rdi), %xmm18
-; AVX512-NEXT:    vmovdqa64 464(%rdi), %xmm19
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %xmm20
-; AVX512-NEXT:    vmovdqa64 80(%rdi), %xmm21
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm29, %zmm6
-; AVX512-NEXT:    vpmovqw %zmm6, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512-NEXT:    vmovdqa64 496(%rdi), %xmm30
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm30[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 480(%rdi), %xmm31
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm31[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpmovqw %zmm24, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm4
+; AVX512-NEXT:    vpmovqw %ymm4, %xmm4
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm9
+; AVX512-NEXT:    vmovdqa64 368(%rdi), %xmm28
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm28[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 352(%rdi), %xmm29
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm29[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-NEXT:    vpmovqw %zmm23, %xmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 336(%rdi), %xmm16
+; AVX512-NEXT:    vmovdqa64 464(%rdi), %xmm17
+; AVX512-NEXT:    vmovdqa64 80(%rdi), %xmm18
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm27, %zmm9
+; AVX512-NEXT:    vpmovqw %zmm9, %xmm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm28, %zmm6
-; AVX512-NEXT:    vpmovqw %zmm6, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm27, %zmm3
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm18[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm25, %zmm9
+; AVX512-NEXT:    vpmovqw %zmm9, %xmm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm7[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm17[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm24, %zmm7
+; AVX512-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %xmm26
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm26[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm23, %zmm5
+; AVX512-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm4[0,1,2,3],zmm6[0,1,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm26, %zmm4
-; AVX512-NEXT:    vpmovqw %zmm4, %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpsrlq $32, %zmm29, %zmm1
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpsrlq $32, %zmm28, %zmm8
-; AVX512-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpsrlq $32, %zmm27, %zmm2
+; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm18[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm18
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT:    vpsrlq $32, %zmm25, %zmm7
+; AVX512-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm3[0,1,2,3],zmm4[0,1,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm30[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm31[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm17[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm17 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm28[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm29[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm28 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm16[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm26[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
-; AVX512-NEXT:    vpsrlq $32, %zmm27, %zmm14
+; AVX512-NEXT:    vinserti32x4 $1, %xmm30, %ymm0, %ymm14
+; AVX512-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-NEXT:    vpsrlq $32, %zmm24, %zmm14
 ; AVX512-NEXT:    vpmovqw %zmm14, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm14
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
-; AVX512-NEXT:    vpsrlq $32, %zmm26, %zmm14
+; AVX512-NEXT:    vpsrlq $32, %zmm23, %zmm14
 ; AVX512-NEXT:    vpmovqw %zmm14, %xmm14
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vpsrlq $48, %zmm29, %zmm3
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpsrlq $48, %zmm27, %zmm5
+; AVX512-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm5
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpsrlq $48, %zmm28, %zmm3
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpsrlq $48, %zmm25, %zmm5
+; AVX512-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpsrlq $48, %zmm27, %zmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpsrlq $48, %zmm24, %zmm2
 ; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpsrlq $48, %zmm26, %zmm3
+; AVX512-NEXT:    vpsrlq $48, %zmm23, %zmm3
 ; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
@@ -5933,422 +5938,418 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovaps %zmm2, 64(%rsi)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512-NEXT:    vmovups (%rsp), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm22, 64(%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm21, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm20, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%r8)
-; AVX512-NEXT:    addq $200, %rsp
+; AVX512-NEXT:    popq %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride4_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm23
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm9, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm26
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
-; AVX512-FCP-NEXT:    vpermd %ymm24, %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm26
-; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512-FCP-NEXT:    vpermd %ymm27, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vpermt2d %ymm11, %ymm7, %ymm12
-; AVX512-FCP-NEXT:    vpmovqw %zmm2, %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 480(%rdi), %ymm28
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm9, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm17
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm9, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm13
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm13
-; AVX512-FCP-NEXT:    vpmovqw %zmm25, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm18
-; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm15
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm28
+; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm5
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT:    vpermt2d %ymm8, %ymm9, %ymm5
+; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm12
+; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm11, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm14
+; AVX512-FCP-NEXT:    vpermt2d %ymm14, %ymm9, %ymm12
+; AVX512-FCP-NEXT:    vpmovqw %zmm2, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm12[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm18
+; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm14
+; AVX512-FCP-NEXT:    vpermd %zmm23, %zmm11, %zmm12
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm9, %ymm14
+; AVX512-FCP-NEXT:    vpmovqw %zmm27, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm19
-; AVX512-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm9
-; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm7, %ymm9
-; AVX512-FCP-NEXT:    vpmovqw %zmm22, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm10
-; AVX512-FCP-NEXT:    vpmovqw %zmm10, %xmm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm14, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vpermd %ymm24, %ymm14, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm14, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vpermd %ymm27, %ymm14, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm13
-; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm7, %ymm13
+; AVX512-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm11, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm11
+; AVX512-FCP-NEXT:    vpermt2d %ymm11, %ymm9, %ymm15
+; AVX512-FCP-NEXT:    vpmovqw %zmm25, %xmm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm11[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm4[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm27, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm25, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm5
+; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm9, %ymm1
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm8
+; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm28, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm12
+; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm9, %ymm8
 ; AVX512-FCP-NEXT:    vpsrlq $32, %zmm2, %zmm12
 ; AVX512-FCP-NEXT:    vpmovqw %zmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm14, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm1
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm14, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm10
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm10
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm25, %zmm1
-; AVX512-FCP-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm14, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vpermd %ymm19, %ymm14, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm6
-; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm22, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm8[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpermd %zmm23, %zmm28, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm13
+; AVX512-FCP-NEXT:    vpermt2d %ymm13, %ymm9, %ymm1
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm27, %zmm13
+; AVX512-FCP-NEXT:    vpmovqw %zmm13, %xmm13
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm28, %zmm6
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm7
+; AVX512-FCP-NEXT:    vpermt2d %ymm7, %ymm9, %ymm13
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm25, %zmm7
 ; AVX512-FCP-NEXT:    vpmovqw %zmm7, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-FCP-NEXT:    vpsrlq $48, %zmm2, %zmm2
 ; AVX512-FCP-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm27, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm25, %zmm4
+; AVX512-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%r8)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride4_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $200, %rsp
-; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm27
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm28
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm29
+; AVX512DQ-NEXT:    pushq %rax
+; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm24
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm25
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm27
 ; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpmovqw %ymm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpmovqw %zmm29, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpmovqw %ymm1, %xmm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm15
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpmovqw %zmm27, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-NEXT:    vpmovqw %ymm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm13
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpmovqw %zmm28, %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpmovqw %zmm25, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 496(%rdi), %xmm24
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 480(%rdi), %xmm23
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpmovqw %zmm27, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpmovqw %ymm1, %xmm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 368(%rdi), %xmm31
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %xmm25
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpmovqw %zmm26, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 336(%rdi), %xmm17
-; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %xmm18
-; AVX512DQ-NEXT:    vmovdqa64 464(%rdi), %xmm19
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %xmm20
-; AVX512DQ-NEXT:    vmovdqa64 80(%rdi), %xmm21
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm29, %zmm6
-; AVX512DQ-NEXT:    vpmovqw %zmm6, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 496(%rdi), %xmm30
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm30[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 480(%rdi), %xmm31
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm31[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpmovqw %zmm24, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm4
+; AVX512DQ-NEXT:    vpmovqw %ymm4, %xmm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 368(%rdi), %xmm28
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm28[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %xmm29
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm29[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpmovqw %zmm23, %xmm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 336(%rdi), %xmm16
+; AVX512DQ-NEXT:    vmovdqa64 464(%rdi), %xmm17
+; AVX512DQ-NEXT:    vmovdqa64 80(%rdi), %xmm18
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm27, %zmm9
+; AVX512DQ-NEXT:    vpmovqw %zmm9, %xmm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm28, %zmm6
-; AVX512DQ-NEXT:    vpmovqw %zmm6, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm27, %zmm3
-; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm18[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm25, %zmm9
+; AVX512DQ-NEXT:    vpmovqw %zmm9, %xmm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm7[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm17[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm24, %zmm7
+; AVX512DQ-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %xmm26
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm26[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm23, %zmm5
+; AVX512DQ-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm4[0,1,2,3],zmm6[0,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm26, %zmm4
-; AVX512DQ-NEXT:    vpmovqw %zmm4, %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm29, %zmm1
-; AVX512DQ-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm28, %zmm8
-; AVX512DQ-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm27, %zmm2
+; AVX512DQ-NEXT:    vpmovqw %zmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm18[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm18
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm25, %zmm7
+; AVX512DQ-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm3[0,1,2,3],zmm4[0,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm30[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm31[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm17[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm17 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm28[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm29[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm28 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm16[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm26[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm27, %zmm14
+; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm30, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm24, %zmm14
 ; AVX512DQ-NEXT:    vpmovqw %zmm14, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm14
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm26, %zmm14
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm23, %zmm14
 ; AVX512DQ-NEXT:    vpmovqw %zmm14, %xmm14
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm0
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm29, %zmm3
-; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm27, %zmm5
+; AVX512DQ-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm5
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm28, %zmm3
-; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm25, %zmm5
+; AVX512DQ-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm27, %zmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm24, %zmm2
 ; AVX512DQ-NEXT:    vpmovqw %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm26, %zmm3
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm23, %zmm3
 ; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
@@ -6356,164 +6357,163 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rsi)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-NEXT:    vmovups (%rsp), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r8)
-; AVX512DQ-NEXT:    addq $200, %rsp
+; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm26
-; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT:    vpermd %ymm27, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm11, %ymm7, %ymm12
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm2, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 480(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm9, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm25, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm28
+; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermd %zmm24, %zmm11, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm14, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm2, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm12[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm14
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm11, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm9, %ymm14
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm27, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm22, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm14, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vpermd %ymm27, %ymm14, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm7, %ymm13
+; AVX512DQ-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm11, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm11, %ymm9, %ymm15
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm25, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm11[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm4[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermd %zmm24, %zmm28, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm9, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm2, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpmovqw %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm14, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm13, %ymm10
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm10
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm14, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpermd %ymm19, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm22, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm8[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm28, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm13, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm27, %zmm13
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm28, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm7, %ymm9, %ymm13
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm25, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpmovqw %zmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%r8)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 739e6e2369e36..bb502c79f6ff9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -377,85 +377,101 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX2-LABEL: load_i16_stride5_vf4:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-NEXT:    vmovq %xmm4, (%rdx)
-; AVX2-NEXT:    vmovq %xmm5, (%rcx)
-; AVX2-NEXT:    vmovq %xmm6, (%r8)
-; AVX2-NEXT:    vmovq %xmm0, (%r9)
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-NEXT:    vmovq %xmm5, (%rsi)
+; AVX2-NEXT:    vmovq %xmm6, (%rdx)
+; AVX2-NEXT:    vmovq %xmm7, (%rcx)
+; AVX2-NEXT:    vmovq %xmm0, (%r8)
+; AVX2-NEXT:    vmovq %xmm1, (%r9)
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride5_vf4:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT:    vmovq %xmm4, (%rdx)
-; AVX2-FP-NEXT:    vmovq %xmm5, (%rcx)
-; AVX2-FP-NEXT:    vmovq %xmm6, (%r8)
-; AVX2-FP-NEXT:    vmovq %xmm0, (%r9)
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm2[1,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovq %xmm5, (%rsi)
+; AVX2-FP-NEXT:    vmovq %xmm6, (%rdx)
+; AVX2-FP-NEXT:    vmovq %xmm7, (%rcx)
+; AVX2-FP-NEXT:    vmovq %xmm0, (%r8)
+; AVX2-FP-NEXT:    vmovq %xmm1, (%r9)
+; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT:    vmovq %xmm4, (%rdx)
-; AVX2-FCP-NEXT:    vmovq %xmm5, (%rcx)
-; AVX2-FCP-NEXT:    vmovq %xmm6, (%r8)
-; AVX2-FCP-NEXT:    vmovq %xmm0, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,1,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,5,2,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovq %xmm5, (%rsi)
+; AVX2-FCP-NEXT:    vmovq %xmm6, (%rdx)
+; AVX2-FCP-NEXT:    vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT:    vmovq %xmm0, (%r8)
+; AVX2-FCP-NEXT:    vmovq %xmm1, (%r9)
+; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride5_vf4:
@@ -470,27 +486,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
 ; AVX512-NEXT:    vpextrw $6, %xmm0, %eax
 ; AVX512-NEXT:    vpextrw $1, %xmm0, %r10d
-; AVX512-NEXT:    vmovd %r10d, %xmm4
-; AVX512-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; AVX512-NEXT:    vmovd %r10d, %xmm0
+; AVX512-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX512-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm1
+; AVX512-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm2, %eax
-; AVX512-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512-NEXT:    vpermd (%rdi), %zmm1, %zmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5,6,7,8,9],mem[10],ymm4[11,12,13,14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
 ; AVX512-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512-NEXT:    vmovq %xmm1, (%rdx)
-; AVX512-NEXT:    vmovq %xmm5, (%rcx)
-; AVX512-NEXT:    vmovq %xmm6, (%r8)
-; AVX512-NEXT:    vmovq %xmm0, (%r9)
+; AVX512-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-NEXT:    vmovq %xmm1, (%rcx)
+; AVX512-NEXT:    vmovq %xmm4, (%r8)
+; AVX512-NEXT:    vmovq %xmm2, (%r9)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride5_vf4:
@@ -503,25 +522,28 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpextrw $7, %xmm1, %eax
 ; AVX512-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vmovd %xmm2, %eax
-; AVX512-FCP-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,1,6,7]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1],mem[2],ymm5[3,4,5,6,7,8,9],mem[10],ymm5[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,5,2,7]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovq %xmm1, (%rdx)
-; AVX512-FCP-NEXT:    vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
-; AVX512-FCP-NEXT:    vmovq %xmm0, (%r9)
+; AVX512-FCP-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-FCP-NEXT:    vmovq %xmm1, (%rcx)
+; AVX512-FCP-NEXT:    vmovq %xmm4, (%r8)
+; AVX512-FCP-NEXT:    vmovq %xmm2, (%r9)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride5_vf4:
@@ -536,27 +558,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
 ; AVX512DQ-NEXT:    vpextrw $6, %xmm0, %eax
 ; AVX512DQ-NEXT:    vpextrw $1, %xmm0, %r10d
-; AVX512DQ-NEXT:    vmovd %r10d, %xmm4
-; AVX512DQ-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vmovd %r10d, %xmm0
+; AVX512DQ-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX512DQ-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm1
+; AVX512DQ-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm2, %eax
-; AVX512DQ-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512DQ-NEXT:    vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5,6,7,8,9],mem[10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT:    vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT:    vmovq %xmm5, (%rcx)
-; AVX512DQ-NEXT:    vmovq %xmm6, (%r8)
-; AVX512DQ-NEXT:    vmovq %xmm0, (%r9)
+; AVX512DQ-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT:    vmovq %xmm4, (%r8)
+; AVX512DQ-NEXT:    vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4:
@@ -569,124 +594,143 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpextrw $7, %xmm1, %eax
 ; AVX512DQ-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovd %xmm2, %eax
-; AVX512DQ-FCP-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,1,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1],mem[2],ymm5[3,4,5,6,7,8,9],mem[10],ymm5[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,5,2,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%r9)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i16_stride5_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512BW-NEXT:    vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT:    vpextrw $5, %xmm1, %eax
+; AVX512BW-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512BW-NEXT:    vpextrw $7, %xmm2, %eax
+; AVX512BW-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX512BW-NEXT:    vpextrw $1, %xmm1, %r10d
+; AVX512BW-NEXT:    vmovd %r10d, %xmm1
+; AVX512BW-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX512BW-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512BW-NEXT:    vmovq %xmm3, (%rcx)
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm5, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512BW-NEXT:    vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT:    vmovq %xmm2, (%rcx)
 ; AVX512BW-NEXT:    vmovq %xmm4, (%r8)
-; AVX512BW-NEXT:    vmovq %xmm1, (%r9)
+; AVX512BW-NEXT:    vmovq %xmm0, (%r9)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpextrw $7, %xmm2, %eax
+; AVX512BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm5, %zmm0
+; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovq %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT:    vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%r9)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpextrw $5, %xmm1, %eax
+; AVX512DQ-BW-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpextrw $7, %xmm2, %eax
+; AVX512DQ-BW-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX512DQ-BW-NEXT:    vpextrw $1, %xmm1, %r10d
+; AVX512DQ-BW-NEXT:    vmovd %r10d, %xmm1
+; AVX512DQ-BW-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX512DQ-BW-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm2, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rcx)
 ; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%r9)
+; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpextrw $7, %xmm2, %eax
+; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <20 x i16>, ptr %in.vec, align 64
@@ -984,9 +1028,9 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,1,6,3]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm8, %ymm8
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
@@ -1031,7 +1075,8 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vprolq $16, %xmm6, %xmm6
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
@@ -1074,12 +1119,12 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,1,6,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vprolq $16, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
 ; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
@@ -1123,7 +1168,8 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vprolq $16, %xmm6, %xmm6
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
@@ -1166,12 +1212,12 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vprolq $16, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
@@ -1513,50 +1559,50 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vmovdqa 112(%rdi), %xmm2
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm4
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa 144(%rdi), %xmm8
 ; AVX-NEXT:    vmovdqa 128(%rdi), %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,1,0,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm5[4],xmm10[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm11, %ymm9, %ymm12
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1]
+; AVX-NEXT:    vandps %ymm11, %ymm10, %ymm12
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm10[0,1,0,1]
 ; AVX-NEXT:    vandnps %ymm13, %ymm11, %ymm13
 ; AVX-NEXT:    vorps %ymm13, %ymm12, %ymm12
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX-NEXT:    vpsllq $48, %xmm9, %xmm13
+; AVX-NEXT:    vpsllq $48, %xmm10, %xmm13
 ; AVX-NEXT:    vandnps %ymm13, %ymm11, %ymm13
-; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm14
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3]
+; AVX-NEXT:    vpsrlq $48, %xmm3, %xmm14
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
@@ -1565,38 +1611,38 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vandps %ymm11, %ymm14, %ymm11
 ; AVX-NEXT:    vorps %ymm13, %ymm11, %ymm11
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7]
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
-; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm14
+; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm14
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
@@ -1606,22 +1652,23 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovaps %ymm10, (%rsi)
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps %ymm9, (%rsi)
 ; AVX-NEXT:    vmovaps %ymm11, (%rdx)
 ; AVX-NEXT:    vmovaps %ymm12, (%rcx)
 ; AVX-NEXT:    vmovaps %ymm13, (%r8)
@@ -1643,11 +1690,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
-; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm6
-; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm7
+; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0],xmm4[1],xmm7[2,3]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
@@ -1660,12 +1707,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT:    vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1675,7 +1722,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
 ; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1689,11 +1736,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1703,17 +1750,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqa %ymm5, (%rsi)
-; AVX2-NEXT:    vmovdqa %ymm7, (%rdx)
+; AVX2-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX2-NEXT:    vmovdqa %ymm8, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX2-NEXT:    vmovdqa %ymm7, (%r8)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1732,15 +1779,15 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm6
-; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm7
+; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0],xmm5[1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
@@ -1749,12 +1796,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3]
+; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm5[2],xmm7[3]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1764,7 +1811,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm5[0],xmm7[1],xmm5[2,3]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1778,11 +1825,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm7[2],xmm5[3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1792,15 +1839,16 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rdx)
+; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm7, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
@@ -1858,9 +1906,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm9, %ymm9
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -1909,27 +1957,27 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
 ; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm5
-; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm7
+; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX512-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1939,7 +1987,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
 ; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1953,11 +2001,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1967,17 +2015,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vmovdqa %ymm5, (%rsi)
-; AVX512-NEXT:    vmovdqa %ymm7, (%rdx)
+; AVX512-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX512-NEXT:    vmovdqa %ymm8, (%rcx)
-; AVX512-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512-NEXT:    vmovdqa %ymm7, (%r8)
 ; AVX512-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -2034,9 +2082,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm9, %ymm9
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -2060,14 +2108,13 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
 ; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
-; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -2086,27 +2133,27 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX512DQ-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -2116,7 +2163,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -2130,11 +2177,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -2144,17 +2191,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2211,9 +2258,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm9, %ymm9
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -2237,14 +2284,13 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -2887,18 +2933,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-LABEL: load_i16_stride5_vf32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $424, %rsp # imm = 0x1A8
-; AVX-NEXT:    vmovdqa 144(%rdi), %xmm9
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vmovdqa 144(%rdi), %xmm11
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6,7]
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,1,3]
+; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vmovdqa 112(%rdi), %xmm10
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 112(%rdi), %xmm8
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
+; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -2907,40 +2954,38 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm12
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm15
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
-; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm13
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm14
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm10
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,1,0,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm6, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1]
-; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vandnps %ymm4, %ymm6, %ymm4
+; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
 ; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 304(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 288(%rdi), %xmm13
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT:    vmovdqa 288(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7]
+; AVX-NEXT:    vmovdqa %xmm12, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmovdqa 256(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
 ; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; AVX-NEXT:    vmovdqa 240(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2960,703 +3005,714 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm14
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6,7]
+; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vandps %ymm6, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
 ; AVX-NEXT:    vmovaps 224(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
-; AVX-NEXT:    vandnps %ymm8, %ymm6, %ymm8
-; AVX-NEXT:    vorps %ymm0, %ymm8, %ymm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1]
+; AVX-NEXT:    vandnps %ymm6, %ymm5, %ymm6
+; AVX-NEXT:    vorps %ymm6, %ymm0, %ymm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa %xmm11, %xmm6
-; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa %xmm11, %xmm5
+; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm11[4,5],xmm9[6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm9
+; AVX-NEXT:    vmovdqa %xmm13, %xmm4
+; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpsrlq $48, %xmm13, %xmm9
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; AVX-NEXT:    vmovdqa %xmm15, %xmm12
-; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm14[0,1],xmm10[2,3],xmm14[4,5],xmm10[6,7]
+; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7]
-; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm3, %ymm10, %ymm3
-; AVX-NEXT:    vpsllq $48, %xmm5, %xmm9
-; AVX-NEXT:    vandnps %ymm9, %ymm10, %ymm9
+; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm3, %ymm11, %ymm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpsllq $48, %xmm8, %xmm9
+; AVX-NEXT:    vandnps %ymm9, %ymm11, %ymm9
 ; AVX-NEXT:    vorps %ymm3, %ymm9, %ymm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm12[4,5],xmm13[6,7]
+; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm11[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7]
+; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5],mem[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
 ; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm8
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm6, %xmm6
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm1, %ymm8, %ymm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpsllq $48, %xmm14, %xmm3
-; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
+; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vpsllq $48, %xmm15, %xmm3
+; AVX-NEXT:    vandnps %ymm3, %ymm6, %ymm3
 ; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm10[2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vmovdqa %xmm5, %xmm9
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm7, %xmm0, %xmm4
+; AVX-NEXT:    vmovdqa %xmm13, %xmm2
+; AVX-NEXT:    vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm13[2,3],xmm9[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm6
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vmovdqa %xmm15, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm5[4,5],xmm12[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa %xmm10, %xmm2
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm12[2,3],xmm5[4,5],xmm12[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm3
+; AVX-NEXT:    vpsrlq $48, %xmm11, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm2[4,5],xmm9[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
 ; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
 ; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3],xmm13[4,5],xmm14[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
-; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm12[4,5],xmm11[6,7]
+; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm7
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm11, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm7[5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm7[2,3],xmm4[4,5,6,7]
 ; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm7
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm10
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm7[4,5],xmm4[6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,1,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm5
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm6
+; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,3,2,3]
+; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
+; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm3 = mem[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,3,2,3]
-; AVX-NEXT:    vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
-; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, (%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, (%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
-; AVX-NEXT:    vmovaps %ymm7, (%r8)
-; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
-; AVX-NEXT:    vmovaps %ymm1, (%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, (%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 32(%r8)
+; AVX-NEXT:    vmovaps %ymm10, (%r8)
+; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
+; AVX-NEXT:    vmovaps %ymm6, (%r9)
 ; AVX-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: load_i16_stride5_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX2-NEXT:    subq $232, %rsp
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm15
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm15
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT:    vpblendvb %ymm10, %ymm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm7[1,2],ymm15[3],ymm7[4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10],ymm15[11],ymm7[12],ymm15[13],ymm7[14,15]
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7]
+; AVX2-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
+; AVX2-NEXT:    vpblendvb %ymm10, %ymm11, %ymm0, %ymm11
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm0, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7]
-; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7]
-; AVX2-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX2-NEXT:    vpblendvb %ymm9, %ymm10, %ymm0, %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
-; AVX2-NEXT:    vmovdqa %ymm15, %ymm5
-; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
-; AVX2-NEXT:    vpshufb %ymm10, %ymm11, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm15
-; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
-; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm13
-; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm14
-; AVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm10, %ymm9
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-NEXT:    vpblendvb %ymm10, %ymm13, %ymm0, %ymm13
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
+; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm12
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6],xmm12[7]
+; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm0
+; AVX2-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
+; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm12
+; AVX2-NEXT:    vpblendvb %ymm10, %ymm14, %ymm15, %ymm14
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm0[0],xmm12[1],xmm0[2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm10
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm15
+; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm10
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm15[0],xmm10[1],xmm15[2,3]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
-; AVX2-NEXT:    vmovdqa %ymm6, %ymm9
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
-; AVX2-NEXT:    vmovdqa %ymm5, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm12[2],xmm0[3]
+; AVX2-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm7
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm10[2],xmm15[3]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
-; AVX2-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15]
-; AVX2-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5,6,7],ymm14[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6],ymm1[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm7
+; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm14[3,4],xmm9[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm2[1],xmm12[2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6],ymm9[7]
+; AVX2-NEXT:    vpshufb %ymm14, %ymm9, %ymm9
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm5[2],ymm1[3],ymm5[4],ymm1[5,6],ymm5[7],ymm1[8,9],ymm5[10],ymm1[11],ymm5[12],ymm1[13,14],ymm5[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3,4],xmm11[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4],ymm0[5],ymm7[6,7],ymm0[8],ymm7[9,10],ymm0[11],ymm7[12],ymm0[13],ymm7[14,15]
+; AVX2-NEXT:    vmovdqa %ymm7, %ymm9
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm7
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm12[0,1],mem[2],xmm12[3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm2[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5],ymm13[6],ymm2[7]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4],ymm4[5],ymm1[6,7],ymm4[8],ymm1[9,10],ymm4[11],ymm1[12],ymm4[13],ymm1[14,15]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm15[2],xmm10[3]
+; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1],ymm8[2],mem[3],ymm8[4],mem[5,6],ymm8[7],mem[8,9],ymm8[10],mem[11],ymm8[12],mem[13,14],ymm8[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4],ymm3[5,6],ymm7[7]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4],xmm7[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-NEXT:    vmovdqa %ymm15, (%r8)
 ; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
-; AVX2-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX2-NEXT:    addq $232, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride5_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX2-FP-NEXT:    subq $280, %rsp # imm = 0x118
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm14
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15]
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3],xmm10[4,5],xmm11[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm10, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7]
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm1, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1,2,3],xmm8[4,5],xmm13[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm0, %ymm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15]
-; AVX2-FP-NEXT:    vmovdqa %ymm14, %ymm5
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm14
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7]
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm9, %ymm15
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
-; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm8
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm13, %ymm0, %ymm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7]
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm0
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
+; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm0[0],xmm3[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, %xmm8
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5,6,7],ymm10[8,9,10,11,12],ymm14[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm6
 ; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm3[2],xmm8[3]
+; AVX2-FP-NEXT:    vmovdqa %xmm3, %xmm6
+; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm14[2],xmm15[3]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
+; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15]
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm10
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm6[0],xmm8[1],xmm6[2,3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm10, %xmm10
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7]
+; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11],ymm1[12],ymm6[13,14],ymm1[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm15[1],xmm14[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm5
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm6, %xmm3
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm15, %xmm12
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7],ymm10[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5],ymm10[6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15]
+; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm11
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm8[2],xmm10[3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm3
+; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm12
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2],xmm2[3]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm15[2],xmm14[3]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
 ; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
 ; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm5
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r8)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm8, 32(%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm15, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r9)
-; AVX2-FP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r9)
+; AVX2-FP-NEXT:    addq $280, %rsp # imm = 0x118
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride5_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $296, %rsp # imm = 0x128
+; AVX2-FCP-NEXT:    subq $264, %rsp # imm = 0x108
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
 ; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
@@ -3668,14 +3724,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm13
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm2
 ; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
@@ -3686,13 +3742,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
 ; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
 ; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
 ; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm10, %ymm13
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
@@ -3704,14 +3761,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm10, %ymm0
 ; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm14, %ymm9
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
-; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm14, %ymm8
-; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm9
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3728,14 +3786,10 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4]
 ; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm14, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
@@ -3746,65 +3800,64 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
 ; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm5
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm9
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm13
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
 ; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm14, %ymm11
 ; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm11, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm2
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm7
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm11, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm11
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,5,0,2,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm13, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm2, %ymm8
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [2,4,7,0,2,4,7,0]
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15]
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm11, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm11[2],mem[3],ymm11[4],mem[5,6],ymm11[7],mem[8,9],ymm11[10],mem[11],ymm11[12],mem[13,14],ymm11[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5]
@@ -3820,12 +3873,12 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
+; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7],mem[8,9],ymm14[10],mem[11],ymm14[12],mem[13,14],ymm14[15]
 ; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
@@ -3849,338 +3902,345 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, (%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
-; AVX2-FCP-NEXT:    addq $296, %rsp # imm = 0x128
+; AVX2-FCP-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride5_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm9
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-NEXT:    vmovdqa64 176(%rdi), %xmm20
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm4
+; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm6
+; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm10
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm5
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm9
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm13
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %ymm11, %ymm12, %ymm15
-; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm13, %xmm7
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm16
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vpor %ymm11, %ymm12, %ymm11
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm16 & (zmm11 ^ zmm10))
+; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm13
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm10
+; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm17
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-NEXT:    vpsrlq $48, %xmm20, %xmm15
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm0
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
+; AVX512-NEXT:    vpsrlq $48, %xmm2, %xmm15
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3]
 ; AVX512-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm19
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm13
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm20
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %ymm13, %ymm14, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm18
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm5[0,1,1,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm15 = xmm7[0],xmm6[1],xmm7[2,3]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm15, %xmm14
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm16
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm22
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm15 = xmm2[0],xmm5[1],xmm2[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm15 = xmm7[0,1],xmm6[2],xmm7[3]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
 ; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (mem & (zmm13 ^ zmm11))
+; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm14
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm17
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm2
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm11[0],ymm14[1,2,3,4,5,6,7],ymm11[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm5[2],xmm2[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX512-NEXT:    movb $7, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512-NEXT:    vmovdqa64 %zmm19, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm17, (%r8)
+; AVX512-NEXT:    vmovdqa64 %zmm17, (%rsi)
+; AVX512-NEXT:    vmovdqa64 %zmm18, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm16, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm11, (%r8)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride5_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm5, %ymm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,3,1,0,0,3,5,0]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpandnq %zmm5, %zmm13, %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm7, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm13, %zmm16
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,0,0,0,4,6,1,3]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1,2,3],xmm6[4,5],xmm12[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm6, %ymm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = (zmm12 & zmm13) | zmm7
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm18, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm12, %zmm16
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6],xmm12[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %ymm13, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm2, %xmm12
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm17, %ymm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm14
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm7
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm12
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
 ; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,2,0,1,3,6,0]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512-FCP-NEXT:    vpandnq %zmm12, %zmm13, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
 ; AVX512-FCP-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm14, %zmm19
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,0,0,5,7,2,4]
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,3,5,2,5,7,0,0]
 ; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm15, %zmm17
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
-; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm19, %ymm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
 ; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512-FCP-NEXT:    vpandnq %zmm12, %zmm13, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm18, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm14, %zmm13
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,1,6,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm12, %ymm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm18, %ymm15
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
 ; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm12))
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm7, %ymm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm7
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
@@ -4190,366 +4250,373 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    movb $7, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride5_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vmovdqa64 176(%rdi), %xmm20
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm13
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %ymm11, %ymm12, %ymm15
-; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm13, %xmm7
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm16
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vpor %ymm11, %ymm12, %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm16 & (zmm11 ^ zmm10))
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm11, %ymm13
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm10
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm17
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-NEXT:    vpsrlq $48, %xmm20, %xmm15
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm2, %xmm15
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3]
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm19
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm20
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %ymm13, %ymm14, %ymm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm18
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm5[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm15 = xmm7[0],xmm6[1],xmm7[2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm16
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm22
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm15 = xmm2[0],xmm5[1],xmm2[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm15 = xmm7[0,1],xmm6[2],xmm7[3]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm11 ^ (mem & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm14
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm17
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm11[0],ymm14[1,2,3,4,5,6,7],ymm11[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm5[2],xmm2[3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX512DQ-NEXT:    movb $7, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%r8)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm5, %ymm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,3,1,0,0,3,5,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm5, %zmm13, %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm13, %zmm16
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,0,0,0,4,6,1,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1,2,3],xmm6[4,5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = (zmm12 & zmm13) | zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm18, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm12, %zmm16
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm17, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,2,0,1,3,6,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm12, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
 ; AVX512DQ-FCP-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm14, %zmm19
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,2,0,0,5,7,2,4]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,3,5,2,5,7,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm15, %zmm17
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm19, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
 ; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm12, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm18, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
 ; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm12))
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm7, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
@@ -4559,34 +4626,34 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    movb $7, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -5863,13 +5930,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm14
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[0,1,1,3]
+; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vmovdqa 272(%rdi), %xmm15
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1]
-; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 272(%rdi), %xmm13
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm13[1]
+; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 240(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -5883,7 +5950,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
 ; AVX-NEXT:    vmovdqa 176(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa 160(%rdi), %xmm4
@@ -5909,44 +5976,44 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1]
 ; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 560(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa 624(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 608(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; AVX-NEXT:    vmovdqa 608(%rdi), %xmm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7]
+; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
 ; AVX-NEXT:    vmovdqa 496(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 480(%rdi), %xmm9
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
-; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 480(%rdi), %xmm10
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3]
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
-; AVX-NEXT:    vmovdqa 512(%rdi), %xmm13
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 512(%rdi), %xmm7
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5,6,7]
+; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
 ; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps 544(%rdi), %xmm11
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1]
-; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 544(%rdi), %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
+; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vandnps %ymm4, %ymm5, %ymm4
 ; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm10
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
 ; AVX-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5956,20 +6023,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 144(%rdi), %xmm7
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm6
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 144(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; AVX-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm0
@@ -5987,11 +6054,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 464(%rdi), %xmm8
+; AVX-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 448(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6025,13 +6092,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
 ; AVX-NEXT:    vmovaps 384(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1]
-; AVX-NEXT:    vandnps %ymm3, %ymm5, %ymm3
-; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1]
+; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
+; AVX-NEXT:    vorps %ymm2, %ymm15, %ymm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -6039,99 +6105,101 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4],xmm15[5,6,7]
 ; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm15
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm14
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm13[0,1],mem[2,3],xmm13[4,5],mem[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7]
 ; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpsllq $48, %xmm4, %xmm15
-; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpsllq $48, %xmm3, %xmm14
+; AVX-NEXT:    vandnps %ymm14, %ymm5, %ymm14
+; AVX-NEXT:    vorps %ymm0, %ymm14, %ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm15
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
+; AVX-NEXT:    vpshufd $236, (%rsp), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm9[0,1,2,3],mem[4,5],xmm9[6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm15
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2,3],xmm7[4,5],xmm9[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
-; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT:    vpsllq $48, %xmm11, %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX-NEXT:    vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT:    vpsllq $48, %xmm8, %xmm15
 ; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT:    vorps %ymm3, %ymm15, %ymm3
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT:    vorps %ymm15, %ymm14, %ymm14
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm10, %xmm15
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm7, %xmm15
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
-; AVX-NEXT:    vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpsllq $48, %xmm7, %xmm15
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX-NEXT:    vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vpsllq $48, %xmm6, %xmm15
 ; AVX-NEXT:    vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT:    vorps %ymm3, %ymm15, %ymm3
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT:    vorps %ymm15, %ymm14, %ymm14
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
 ; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -6140,9 +6208,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm2 = mem[0,3,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm3, %xmm3
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm14, %xmm14
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -6161,98 +6229,98 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = mem[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT:    vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
 ; AVX-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = mem[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT:    vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1],xmm13[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0,1],xmm9[2,3],xmm13[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = mem[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
 ; AVX-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = mem[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm6[0,1,2,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0]
@@ -6268,227 +6336,232 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm2
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm15[0,1,2,3],mem[4,5],xmm15[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
 ; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = mem[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT:    vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT:    vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT:    vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1],xmm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm15[0,1,2,3],mem[4,5],xmm15[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
+; AVX-NEXT:    vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT:    vmovdqa %xmm0, %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT:    vmovdqa %xmm0, %xmm6
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm7, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[0,1,2,3],xmm9[4,5],mem[6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm13[2,3],xmm10[4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm6[4,5],xmm11[6,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[0,3,2,3]
-; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7]
-; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[1,1,1,1]
+; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0,3,2,3]
+; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1,2],mem[3],xmm0[4,5,6,7]
+; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[1,1,1,1]
 ; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5],xmm2[6,7]
 ; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[0,1,1,3]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,3,2,3]
-; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
-; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm15 = mem[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7]
-; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm15 = mem[1,1,1,1]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
-; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm12, %ymm12
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm5
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
 ; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[3,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = mem[0,2,2,3]
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[0,3,2,3]
+; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX-NEXT:    vpshufd $238, (%rsp), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7]
+; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[1,1,1,1]
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3,4,5],xmm5[6,7]
+; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm14, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,1,4,5,6,7]
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm10[0,1,2,3],xmm13[4,5],xmm10[6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[0,2,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
-; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = mem[0,3,2,3]
-; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7]
-; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4,5],xmm13[6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm7[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm10, %ymm10
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = mem[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,1,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm9[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm10, %ymm10
+; AVX-NEXT:    vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[0,3,2,3]
+; AVX-NEXT:    vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
+; AVX-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
-; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
+; AVX-NEXT:    vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
+; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
 ; AVX-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm4 = mem[1,1,1,1]
-; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
-; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5],xmm0[6,7]
+; AVX-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
+; AVX-NEXT:    vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm3, 64(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -6513,9 +6586,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm3, 96(%r8)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX-NEXT:    vmovaps %ymm2, 64(%r9)
-; AVX-NEXT:    vmovaps %ymm12, (%r9)
-; AVX-NEXT:    vmovaps %ymm0, 96(%r9)
+; AVX-NEXT:    vmovaps %ymm0, 64(%r9)
+; AVX-NEXT:    vmovaps %ymm10, (%r9)
+; AVX-NEXT:    vmovaps %ymm2, 96(%r9)
 ; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
 ; AVX-NEXT:    addq $1032, %rsp # imm = 0x408
 ; AVX-NEXT:    vzeroupper
@@ -6523,978 +6596,972 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride5_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $1048, %rsp # imm = 0x418
-; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
-; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm11
+; AVX2-NEXT:    subq $984, %rsp # imm = 0x3D8
+; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm14
+; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm15
+; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15]
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm8
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm13
-; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm15
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5],ymm4[6],ymm15[7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13],ymm4[14],ymm15[15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13],ymm2[14],ymm6[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
 ; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15]
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm9
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm6, %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4],ymm11[5],ymm9[6,7],ymm11[8],ymm9[9,10],ymm11[11],ymm9[12],ymm11[13],ymm9[14,15]
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm12
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4],ymm7[5],ymm12[6,7],ymm7[8],ymm12[9,10],ymm7[11],ymm12[12],ymm7[13],ymm12[14,15]
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2],mem[3],ymm1[4,5],mem[6],ymm1[7,8],mem[9],ymm1[10],mem[11],ymm1[12,13],mem[14],ymm1[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm8
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7]
-; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm11
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
-; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
-; AVX2-NEXT:    vpblendvb %ymm3, %ymm8, %ymm1, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
-; AVX2-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
-; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm9
-; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm15
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm14
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0],ymm10[1],ymm8[2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm8
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5,6],xmm8[7]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm4, %ymm1, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10,11],ymm7[12],ymm12[13],ymm7[14],ymm12[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm8
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5,6],xmm8[7]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm0, %ymm3, %ymm1
+; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm11
+; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm10
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0],xmm10[1],xmm11[2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm9
+; AVX2-NEXT:    vmovdqa 624(%rdi), %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm9[0],xmm7[1],xmm9[2,3]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 448(%rdi), %xmm6
+; AVX2-NEXT:    vmovdqa 464(%rdi), %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0],xmm3[1],xmm6[2,3]
+; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa %xmm3, %xmm4
+; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 624(%rdi), %xmm15
-; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm12
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0],xmm15[1],xmm12[2,3]
-; AVX2-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 464(%rdi), %xmm10
-; AVX2-NEXT:    vmovdqa 448(%rdi), %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm10[1],xmm8[2,3]
-; AVX2-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
-; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm5
-; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm14 = xmm5[0],xmm3[1],xmm5[2,3]
 ; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %xmm3, %xmm14
 ; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3]
-; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3]
-; AVX2-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm10, %xmm8
+; AVX2-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa %xmm11, %xmm13
+; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm10[2],xmm11[3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
-; AVX2-NEXT:    vmovdqa %xmm8, %xmm10
-; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm7[2],xmm9[3]
+; AVX2-NEXT:    vmovdqa %xmm9, %xmm14
+; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa %xmm7, %xmm12
+; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7],ymm15[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm4[2],xmm6[3]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm3[2],xmm5[3]
+; AVX2-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm5
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm13[1],xmm8[2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm6 = mem[0],xmm12[1],mem[2,3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm5
+; AVX2-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0],xmm14[1],xmm12[2,3]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm6 = mem[0],xmm10[1],mem[2,3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm5
+; AVX2-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm7 = xmm12[0],mem[1],xmm12[2,3]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm2 = mem[0],xmm2[1],mem[2,3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm1 = xmm5[0],mem[1],xmm5[2,3]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm10[0],mem[1,2],ymm10[3],mem[4],ymm10[5],mem[6,7],ymm10[8],mem[9,10],ymm10[11],mem[12],ymm10[13],mem[14,15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm6 = mem[0,1],xmm15[2],mem[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm6 = mem[0,1],xmm11[2],mem[3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3]
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0],ymm9[1,2],ymm15[3],ymm9[4],ymm15[5],ymm9[6,7],ymm15[8],ymm9[9,10],ymm15[11],ymm9[12],ymm15[13],ymm9[14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm3 = xmm12[0,1],mem[2],xmm12[3]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm2 = mem[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm11[1,2],ymm1[3],ymm11[4],ymm1[5],ymm11[6,7],ymm1[8],ymm11[9,10],ymm1[11],ymm11[12],ymm1[13],ymm11[14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm3 = xmm5[0,1],mem[2],xmm5[3]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm4 = mem[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm6
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
+; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm7 = mem[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0,1],ymm6[2],mem[3],ymm6[4],mem[5,6],ymm6[7],mem[8,9],ymm6[10],mem[11],ymm6[12],mem[13,14],ymm6[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
+; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
+; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm7
+; AVX2-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
+; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm8 = mem[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
+; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 448(%rdi), %xmm8
+; AVX2-NEXT:    vpshufb %xmm5, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm9 = mem[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7],ymm0[8,9],ymm12[10],ymm0[11],ymm12[12],ymm0[13,14],ymm12[15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6],ymm9[7]
+; AVX2-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
+; AVX2-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm9 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX2-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
+; AVX2-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm8 = mem[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, 64(%r8)
-; AVX2-NEXT:    vmovdqa %ymm1, (%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT:    vmovdqa %ymm4, 64(%r9)
-; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
-; AVX2-NEXT:    vmovdqa %ymm2, 96(%r9)
-; AVX2-NEXT:    vmovdqa %ymm0, 32(%r9)
-; AVX2-NEXT:    addq $1048, %rsp # imm = 0x418
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX2-NEXT:    vmovdqa %ymm13, 64(%r8)
+; AVX2-NEXT:    vmovdqa %ymm2, (%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-NEXT:    vmovdqa %ymm7, 64(%r9)
+; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
+; AVX2-NEXT:    vmovdqa %ymm6, 96(%r9)
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%r9)
+; AVX2-NEXT:    addq $984, %rsp # imm = 0x3D8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride5_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1080, %rsp # imm = 0x438
-; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    subq $984, %rsp # imm = 0x3D8
+; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4],ymm15[5],ymm0[6,7],ymm15[8],ymm0[9,10],ymm15[11],ymm0[12],ymm15[13],ymm0[14,15]
+; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15]
-; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm4
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
-; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm14
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm7
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15]
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13],ymm2[14],ymm6[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
+; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4,5],xmm5[6,7]
+; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm12
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10,11],ymm14[12],ymm8[13],ymm14[14],ymm8[15]
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4,5],xmm5[6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
 ; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm1
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm8, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
-; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm6, %ymm7, %ymm12
-; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 624(%rdi), %xmm10
-; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 464(%rdi), %xmm6
-; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3]
-; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %xmm9, %xmm4
+; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm7[1],mem[2],ymm7[3],mem[4,5],ymm7[6],mem[7,8],ymm7[9],mem[10],ymm7[11],mem[12,13],ymm7[14],mem[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5,6],xmm7[7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm14[0],ymm8[1],ymm14[2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6],xmm8[7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm7, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6],xmm8[7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm1, %ymm4, %ymm7
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm10
+; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm10[0],xmm5[1],xmm10[2,3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm12
+; AVX2-FP-NEXT:    vmovdqa 624(%rdi), %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0],xmm4[1],xmm12[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm6[5,6,7],ymm1[8,9,10,11,12],ymm6[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %xmm9
+; AVX2-FP-NEXT:    vmovdqa 464(%rdi), %xmm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm13[1],xmm9[2,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %xmm1, %xmm14
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm8[0],xmm1[1],xmm8[2,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
-; AVX2-FP-NEXT:    vmovdqa %xmm10, %xmm13
+; AVX2-FP-NEXT:    vmovdqa %xmm5, %xmm6
+; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %xmm10, %xmm14
 ; AVX2-FP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %xmm8, %xmm10
-; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm5[2],xmm10[3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm4[2],xmm12[3]
+; AVX2-FP-NEXT:    vmovdqa %xmm12, %xmm5
+; AVX2-FP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX2-FP-NEXT:    vmovdqa %xmm6, %xmm3
-; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm13[2],xmm9[3]
+; AVX2-FP-NEXT:    vmovdqa %xmm13, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm1[2],xmm8[3]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm0, %ymm3
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13],ymm15[14],ymm8[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3,4],xmm7[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3,4],xmm7[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm7 = xmm13[0],mem[1],xmm13[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm2 = mem[0],xmm2[1],mem[2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm13[0],mem[1,2],ymm13[3],mem[4],ymm13[5],mem[6,7],ymm13[8],mem[9,10],ymm13[11],mem[12],ymm13[13],mem[14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm8[1],ymm15[2],ymm8[3],ymm15[4,5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10],ymm8[11],ymm15[12,13],ymm8[14],ymm15[15]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm1[0,1],mem[2],xmm1[3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm4[0,1],mem[2],xmm4[3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm1 = xmm15[0,1],mem[2],xmm15[3]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
+; AVX2-FP-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0],ymm9[1,2],mem[3],ymm9[4],mem[5],ymm9[6,7],mem[8],ymm9[9,10],mem[11],ymm9[12],mem[13],ymm9[14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX2-FP-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm3 = xmm14[0,1],mem[2],xmm14[3]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm11
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4],ymm1[5],ymm12[6,7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12],ymm1[13],ymm12[14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT:    vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm3 = xmm0[0,1],mem[2],xmm0[3]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5],ymm13[6],mem[7,8],ymm13[9],mem[10,11],ymm13[12],mem[13],ymm13[14],mem[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm15, %xmm8
+; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm7
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5,6],ymm11[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm4
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm14
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm3, %xmm4
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm5, %ymm5
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm5, %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm3
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm0, 64(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm4, (%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, 96(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%r9)
-; AVX2-FP-NEXT:    addq $1080, %rsp # imm = 0x438
+; AVX2-FP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5,6],ymm11[7]
+; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
+; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm4, 64(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm8, 96(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%r9)
+; AVX2-FP-NEXT:    addq $984, %rsp # imm = 0x3D8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride5_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $1000, %rsp # imm = 0x3E8
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
@@ -7502,395 +7569,396 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3]
 ; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4],ymm14[5],ymm11[6,7],ymm14[8],ymm11[9,10],ymm14[11],ymm11[12],ymm14[13],ymm11[14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4],ymm7[5],ymm4[6,7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12],ymm7[13],ymm4[14,15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5],ymm5[6],ymm14[7,8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13],ymm5[14],ymm14[15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm15[0],ymm6[1,2],ymm15[3],ymm6[4],ymm15[5],ymm6[6,7],ymm15[8],ymm6[9,10],ymm15[11],ymm6[12],ymm15[13],ymm6[14,15]
+; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
 ; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15]
+; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15]
 ; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm3
 ; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
 ; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm11
 ; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm11
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm11, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm11, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0],ymm14[1],ymm5[2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10],ymm14[11],ymm5[12,13],ymm14[14],ymm5[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15]
 ; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm4, %ymm1, %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,1,3,0,3,5,7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm11, %ymm10
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm11, %ymm10
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm11, %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm7, %ymm11
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm6
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm12
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm8, %ymm11
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm8, %ymm6
+; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm7, %ymm3
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm0
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm6
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm7
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0]
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm11
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm8
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm5
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,0,0,5,7,2,4]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm3, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm11
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7],ymm14[8,9],mem[10],ymm14[11],mem[12],ymm14[13,14],mem[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm8, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm6, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm8, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm9, %ymm6
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm7
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,1,6,3]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4],mem[5],ymm0[6,7],mem[8],ymm0[9,10],mem[11],ymm0[12],mem[13],ymm0[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm5
+; AVX2-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm15, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0]
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[0],ymm14[1,2],mem[3],ymm14[4],mem[5],ymm14[6,7],mem[8],ymm14[9,10],mem[11],ymm14[12],mem[13],ymm14[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
+; AVX2-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm13
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm12
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm7
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm7
-; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm1[1],ymm14[2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm15, %ymm8
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4],ymm12[5],ymm10[6,7],ymm12[8],ymm10[9,10],ymm12[11],ymm10[12],ymm12[13],ymm10[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm11
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm6, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4],ymm13[5],ymm9[6,7],ymm13[8],ymm9[9,10],ymm13[11],ymm9[12],ymm13[13],ymm9[14,15]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5]
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
-; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,2,1,3,0,2,5,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7],ymm14[8,9],mem[10],ymm14[11],mem[12],ymm14[13,14],mem[15]
+; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm9
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm4
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
-; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm5
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5],ymm13[6],mem[7,8],ymm13[9],mem[10,11],ymm13[12],mem[13],ymm13[14],mem[15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm10, %ymm6
+; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 64(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, 64(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 64(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, 96(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, 96(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%r9)
 ; AVX2-FCP-NEXT:    addq $1000, %rsp # imm = 0x3E8
 ; AVX2-FCP-NEXT:    vzeroupper
@@ -7898,35 +7966,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride5_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $552, %rsp # imm = 0x228
-; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm6
-; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
-; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    subq $344, %rsp # imm = 0x158
+; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15]
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm8
-; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
-; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm19
-; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm12
-; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm14
+; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm24
+; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm12
+; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm14
+; AVX512-NEXT:    vmovdqa64 160(%rdi), %xmm16
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
@@ -7934,64 +8002,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm23
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm15
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm28
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm18
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm24
+; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm21
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm9
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm22
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm19
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm30
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm27
+; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm20
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm7
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm12
-; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm28
+; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm16
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
@@ -8002,853 +8106,754 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm8
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa %xmm11, %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm23
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
-; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm17
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
-; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm5
-; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm13
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
+; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm12
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm15, %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm31 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm6
+; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm5
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 496(%rdi), %xmm21
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 480(%rdi), %xmm7
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm24
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm4
-; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm22
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm30
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpsrlq $48, %xmm21, %xmm13
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm9
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm7
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm17
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm20
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512-NEXT:    vpsrlq $48, %xmm16, %xmm11
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm23
-; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 496(%rdi), %xmm18
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 480(%rdi), %xmm17
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0],xmm12[1],xmm9[2,3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm26
+; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm14
+; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm13
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4],xmm0[5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm24))
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vextracti64x4 $1, %zmm26, %ymm15
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm30
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm26
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm27[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0],xmm10[1],xmm7[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm22
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm29 & (zmm23 ^ zmm0))
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm23, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm28
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm17[0,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpsrlq $48, %xmm18, %xmm8
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm12[2],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm16
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4],ymm14[5],ymm13[6,7],ymm14[8],ymm13[9,10],ymm14[11],ymm13[12],ymm14[13],ymm13[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2],xmm7[3]
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ mem))
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm25
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm27[0,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpsrlq $48, %xmm26, %xmm8
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm10[2],xmm1[3]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm0 ^ (zmm29 & (zmm21 ^ zmm0))
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm21, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm24
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6],ymm0[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm5
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm28
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm8
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm5
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm10
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm26
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm17
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm31
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm13
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT:    vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm23
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm13
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm14
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm15
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[0,1,1,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm3
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm18[2],xmm8[3],xmm18[3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0],xmm9[1],xmm2[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm0))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm21
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa %ymm8, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT:    vmovdqa %xmm5, %xmm6
-; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm9[2],xmm2[3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm9
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4,5],xmm8[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm25
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm11, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm1
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5,6],ymm7[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm7[3,4],xmm9[5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
+; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
 ; AVX512-NEXT:    movb $7, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vinserti32x4 $1, %xmm31, %ymm0, %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
 ; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, 64(%rsi)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm28, (%rsi)
+; AVX512-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
+; AVX512-NEXT:    vmovdqa64 %zmm25, 64(%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm24, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm2, 64(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 64(%r8)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm2, (%r8)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%r9)
-; AVX512-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512-NEXT:    addq $344, %rsp # imm = 0x158
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride5_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT:    subq $648, %rsp # imm = 0x288
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 496(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm26
-; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm27
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
-; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm18, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm17, %ymm3
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm18, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
-; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm19
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm17, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vmovdqa 496(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [2,4,7,1,4,6,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm20, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [8,9,3,2,4,5,7,6]
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,1,3,0,3,5,7]
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm21
+; AVX512-FCP-NEXT:    vpermd %ymm21, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
 ; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm31, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
-; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm5
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm20, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
-; AVX512-FCP-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm25, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,0,2,4,6,1,3]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm16, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm20, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm20
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm20, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm4
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm24, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm25, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm14[1,2],ymm6[3],ymm14[4],ymm6[5],ymm14[6,7],ymm6[8],ymm14[9,10],ymm6[11],ymm14[12],ymm6[13],ymm14[14,15]
+; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15]
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,4,7,1,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vporq %ymm4, %ymm2, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm6
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm18, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,5,7,4,7,0,0]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [1,3,2,3,1,3,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm26, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm15, %xmm20
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm29
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm21
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm25, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm9
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm28, %xmm3
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm21, %ymm26, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm25
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm16, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
 ; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm5, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm27, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm28[2],xmm0[3],xmm28[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm21, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm13
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm14
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm25, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0],xmm12[1],xmm7[2,3]
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm18, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,6,0,5,0,0,0]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm25
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm31
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm18, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm4, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm26
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0],xmm10[1],xmm9[2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm27, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512-FCP-NEXT:    vpermd %ymm21, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm30
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm12[2],xmm7[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4],ymm13[5],ymm11[6,7],ymm13[8],ymm11[9,10],ymm13[11],ymm11[12],ymm13[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3],ymm13[4],ymm2[5,6],ymm13[7],ymm2[8,9],ymm13[10],ymm2[11],ymm13[12],ymm2[13,14],ymm13[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7],ymm11[8,9],ymm12[10],ymm11[11],ymm12[12],ymm11[13,14],ymm12[15]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm25, %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm25, %zmm21
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm25
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm27, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm26, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm24, %ymm3
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm24, %zmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-FCP-NEXT:    movb $7, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpandnq %zmm24, %zmm27, %zmm17
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = (zmm22 & zmm27) | zmm17
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,3,1,4,6,3]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4],ymm12[5],ymm11[6,7],ymm12[8],ymm11[9,10],ymm12[11],ymm11[12],ymm12[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm28, %zmm17
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0],ymm2[1,2],ymm13[3],ymm2[4],ymm13[5],ymm2[6,7],ymm13[8],ymm2[9,10],ymm13[11],ymm2[12],ymm13[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vpandnq %zmm25, %zmm27, %zmm24
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = (zmm18 & zmm27) | zmm24
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm24
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8,9],ymm10[10],ymm9[11],ymm10[12],ymm9[13,14],ymm10[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm16, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vpandnq %zmm6, %zmm27, %zmm25
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm25
+; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm28, %zmm25
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
-; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm16
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm3
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm3
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,5,0,2,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm20
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm14
+; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm7 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm7 & (zmm26 ^ zmm1))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm7 & (zmm30 ^ zmm0))
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = ymm1[0],mem[1,2],ymm1[3],mem[4],ymm1[5],mem[6,7],ymm1[8],mem[9,10],ymm1[11],mem[12],ymm1[13],mem[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT:    vpermd %ymm21, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm26, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13],ymm7[14],ymm13[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm30, %ymm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7],ymm1[8,9],ymm15[10],ymm1[11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,0,0,6,0,3,5]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13],ymm14[14],ymm9[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    movb $7, %al
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k1}
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm29, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm29, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13],ymm2[14],ymm9[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4],xmm14[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm9[2],ymm4[3],ymm9[4],ymm4[5,6],ymm9[7],ymm4[8,9],ymm9[10],ymm4[11],ymm9[12],ymm4[13,14],ymm9[15]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm16, %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm13[1],ymm7[2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10],ymm13[11],ymm7[12,13],ymm13[14],ymm7[15]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm6
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm27) | zmm6
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm28, %zmm6
+; AVX512-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = (zmm8 & zmm27) | zmm3
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm22, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm18, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovaps %zmm3, 64(%rcx)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm3, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
-; AVX512-FCP-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride5_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $552, %rsp # imm = 0x228
-; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    subq $344, %rsp # imm = 0x158
+; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm11
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm19
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm12
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm14
+; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm24
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 160(%rdi), %xmm16
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
@@ -8856,64 +8861,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm23
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm15
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm28
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm18
-; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm24
+; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm21
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm9
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm19
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm30
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm20
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm28
+; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm16
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
@@ -8924,819 +8965,720 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm8
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa %xmm11, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm23
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm17
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm5
-; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
+; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm12
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm15, %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm31 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm5
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 496(%rdi), %xmm21
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm24
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm22
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %xmm21, %xmm13
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm9
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm17
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm20
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512DQ-NEXT:    vpsrlq $48, %xmm16, %xmm11
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm23
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 496(%rdi), %xmm18
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 480(%rdi), %xmm17
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0],xmm12[1],xmm9[2,3]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm26
+; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm14
+; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm13
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm24))
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm26, %ymm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm30
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm26
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm27[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0],xmm10[1],xmm7[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm22
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm29 & (zmm23 ^ zmm0))
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm23, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm28
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm17[0,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm18, %xmm8
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm12[2],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm16
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4],ymm14[5],ymm13[6,7],ymm14[8],ymm13[9,10],ymm14[11],ymm13[12],ymm14[13],ymm13[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2],xmm7[3]
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ mem))
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm25
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm27[0,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm26, %xmm8
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm10[2],xmm1[3]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm0 ^ (zmm29 & (zmm21 ^ zmm0))
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm21, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm24
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6],ymm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm28
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm8
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm5
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm26
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm17
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm31
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm14
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm15
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm17[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm3
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm18[2],xmm8[3],xmm18[3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0],xmm9[1],xmm2[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm21
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT:    vmovdqa %xmm5, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm9[2],xmm2[3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4,5],xmm8[6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm8, %ymm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm25
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5,6],ymm7[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm7[3,4],xmm9[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
 ; AVX512DQ-NEXT:    movb $7, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm31, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rsi)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm2, 64(%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%r8)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r9)
-; AVX512DQ-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512DQ-NEXT:    addq $344, %rsp # imm = 0x158
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT:    subq $648, %rsp # imm = 0x288
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 496(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa 496(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [2,4,7,1,4,6,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm20, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [8,9,3,2,4,5,7,6]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,3,1,3,0,3,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT:    vpermd %ymm21, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm27
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm17, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,0,2,4,6,1,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm16, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm14[1,2],ymm6[3],ymm14[4],ymm6[5],ymm14[6,7],ymm6[8],ymm14[9,10],ymm6[11],ymm14[12],ymm6[13],ymm14[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm31, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
-; AVX512DQ-FCP-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm25, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,4,7,1,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm6
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm18, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,5,7,4,7,0,0]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [1,3,2,3,1,3,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm26, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm15, %xmm20
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm21
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm9
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm28, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm21, %ymm26, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm25
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm16, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
 ; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm27, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm28[2],xmm0[3],xmm28[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm21, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm24, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0],xmm12[1],xmm7[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm18, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,6,0,5,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm25
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm31
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm26
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0],xmm10[1],xmm9[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm27, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm26, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT:    vpermd %ymm21, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm30
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm12[2],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4],ymm13[5],ymm11[6,7],ymm13[8],ymm11[9,10],ymm13[11],ymm11[12],ymm13[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3],ymm13[4],ymm2[5,6],ymm13[7],ymm2[8,9],ymm13[10],ymm2[11],ymm13[12],ymm2[13,14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7],ymm11[8,9],ymm12[10],ymm11[11],ymm12[12],ymm11[13,14],ymm12[15]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm25, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm25, %zmm21
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm25
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm27, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm24, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    movb $7, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm24, %zmm27, %zmm17
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = (zmm22 & zmm27) | zmm17
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,3,1,4,6,3]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4],ymm12[5],ymm11[6,7],ymm12[8],ymm11[9,10],ymm12[11],ymm11[12],ymm12[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm28, %zmm17
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0],ymm2[1,2],ymm13[3],ymm2[4],ymm13[5],ymm2[6,7],ymm13[8],ymm2[9,10],ymm13[11],ymm2[12],ymm13[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm25, %zmm27, %zmm24
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = (zmm18 & zmm27) | zmm24
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8,9],ymm10[10],ymm9[11],ymm10[12],ymm9[13,14],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm16, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm6, %zmm27, %zmm25
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm25
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm28, %zmm25
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm16
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm3
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,5,0,2,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm20
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm7 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm7 & (zmm26 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm7 & (zmm30 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = ymm1[0],mem[1,2],ymm1[3],mem[4],ymm1[5],mem[6,7],ymm1[8],mem[9,10],ymm1[11],mem[12],ymm1[13],mem[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm21, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm26, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13],ymm7[14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7],ymm1[8,9],ymm15[10],ymm1[11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,0,0,6,0,3,5]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13],ymm14[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    movb $7, %al
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k1}
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm29, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm29, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13],ymm2[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4],xmm14[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm9[2],ymm4[3],ymm9[4],ymm4[5,6],ymm9[7],ymm4[8,9],ymm9[10],ymm4[11],ymm9[12],ymm4[13,14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm16, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm13[1],ymm7[2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10],ymm13[11],ymm7[12,13],ymm13[14],ymm7[15]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm6
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512DQ-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm27) | zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm28, %zmm6
+; AVX512DQ-FCP-NEXT:    vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = (zmm8 & zmm27) | zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm22, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm18, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
-; AVX512DQ-FCP-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index feb75b21d5c8d..5e75ffa465b61 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -483,28 +483,29 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpsrld $16, %xmm1, %xmm4
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpsrld $16, %xmm1, %xmm3
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-NEXT:    vmovq %xmm4, (%rdx)
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    vmovq %xmm3, (%rdx)
 ; AVX2-NEXT:    vmovq %xmm6, (%rcx)
 ; AVX2-NEXT:    vmovq %xmm5, (%r8)
-; AVX2-NEXT:    vmovq %xmm1, (%r9)
-; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    vmovq %xmm2, (%r9)
+; AVX2-NEXT:    vmovq %xmm1, (%rax)
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf4:
@@ -513,27 +514,28 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpsrld $16, %xmm1, %xmm4
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpsrld $16, %xmm1, %xmm3
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT:    vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-FP-NEXT:    vmovq %xmm3, (%rdx)
 ; AVX2-FP-NEXT:    vmovq %xmm6, (%rcx)
 ; AVX2-FP-NEXT:    vmovq %xmm5, (%r8)
-; AVX2-FP-NEXT:    vmovq %xmm1, (%r9)
-; AVX2-FP-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-FP-NEXT:    vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT:    vmovq %xmm1, (%rax)
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride6_vf4:
@@ -542,152 +544,143 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpsrld $16, %xmm1, %xmm4
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpsrld $16, %xmm1, %xmm3
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT:    vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-FCP-NEXT:    vmovq %xmm3, (%rdx)
 ; AVX2-FCP-NEXT:    vmovq %xmm6, (%rcx)
 ; AVX2-FCP-NEXT:    vmovq %xmm5, (%r8)
-; AVX2-FCP-NEXT:    vmovq %xmm1, (%r9)
-; AVX2-FCP-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-FCP-NEXT:    vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT:    vmovq %xmm1, (%rax)
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride6_vf4:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpsrld $16, 16(%rdi), %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = mem[0,1,0,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512-NEXT:    vpermd %zmm1, %zmm3, %zmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-NEXT:    vpermd %zmm2, %zmm5, %zmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-NEXT:    vpermd %zmm1, %zmm5, %zmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    vmovq %xmm2, (%rdx)
 ; AVX512-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512-NEXT:    vmovq %xmm1, (%r8)
+; AVX512-NEXT:    vmovq %xmm3, (%r8)
 ; AVX512-NEXT:    vmovq %xmm5, (%r9)
-; AVX512-NEXT:    vmovq %xmm2, (%rax)
+; AVX512-NEXT:    vmovq %xmm1, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride6_vf4:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpsrld $16, 16(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-FCP-NEXT:    vmovq %xmm2, (%rdx)
 ; AVX512-FCP-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512-FCP-NEXT:    vmovq %xmm1, (%r8)
+; AVX512-FCP-NEXT:    vmovq %xmm3, (%r8)
 ; AVX512-FCP-NEXT:    vmovq %xmm5, (%r9)
-; AVX512-FCP-NEXT:    vmovq %xmm2, (%rax)
+; AVX512-FCP-NEXT:    vmovq %xmm1, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride6_vf4:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpsrld $16, 16(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = mem[0,1,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512DQ-NEXT:    vmovq %xmm2, (%rdx)
 ; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT:    vmovq %xmm1, (%r8)
+; AVX512DQ-NEXT:    vmovq %xmm3, (%r8)
 ; AVX512DQ-NEXT:    vmovq %xmm5, (%r9)
-; AVX512DQ-NEXT:    vmovq %xmm2, (%rax)
+; AVX512DQ-NEXT:    vmovq %xmm1, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf4:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpsrld $16, 16(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -1028,16 +1021,16 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
@@ -1045,13 +1038,13 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
 ; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-NEXT:    vmovdqa %xmm5, (%rdx)
@@ -1092,15 +1085,15 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm6
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
@@ -1108,13 +1101,13 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsi)
 ; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rdx)
@@ -1129,20 +1122,20 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm6
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX2-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
 ; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
@@ -1150,36 +1143,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,7,6,5]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsi)
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
@@ -1197,25 +1190,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm7
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7]
 ; AVX512-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
@@ -1232,22 +1222,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm4, (%rdx)
 ; AVX512-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX512-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512-NEXT:    vmovdqa %xmm1, (%r9)
@@ -1258,63 +1248,64 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-LABEL: load_i16_stride6_vf8:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [10,1,10,3]
+; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,7,6,5]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3],xmm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -1327,25 +1318,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm7
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7]
 ; AVX512DQ-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm6
@@ -1362,22 +1350,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r9)
@@ -1388,63 +1376,64 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vpslld $16, %xmm3, %xmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -2055,31 +2044,32 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride6_vf16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm8[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm5[2,3]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm1[0,1],ymm5[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7],ymm1[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
@@ -2091,11 +2081,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
@@ -2105,303 +2095,296 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm12
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendvb %ymm11, %ymm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX2-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX2-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX2-NEXT:    vmovdqa %ymm5, (%r9)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX2-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX2-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf16:
 ; AVX2-FP:       # %bb.0:
+; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm7
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm7
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6],xmm10[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm6
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm10
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm10, %xmm12
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rdx)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm10, %xmm6
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
-; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride6_vf16:
 ; AVX2-FCP:       # %bb.0:
+; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm7
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm7
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [6,5,4,7]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm12, %ymm12
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,5,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm9
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [4,7,6,5]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm6
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm12
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm6
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
-; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
@@ -2528,8 +2511,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm5
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm9
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
@@ -2539,6 +2522,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
@@ -2546,7 +2530,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
@@ -2559,68 +2544,74 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [10,1,10,3]
+; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm8, %zmm9
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,7,6,5]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1,2],xmm8[3],xmm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1,2,3,4],xmm12[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [12,5,14,5]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
+; AVX512-FCP-NEXT:    vpermd %zmm14, %zmm12, %zmm12
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [2,9,8,11]
+; AVX512-FCP-NEXT:    vpermd %zmm14, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2],xmm1[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm11[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm9[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm7) | ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm10
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [0,3,2,9]
+; AVX512-FCP-NEXT:    vpermd %zmm14, %zmm11, %zmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5],xmm10[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm7)
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r8)
@@ -2752,8 +2743,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm6, %ymm9
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
@@ -2763,6 +2754,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
@@ -2770,7 +2762,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
@@ -2783,68 +2776,74 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1,2],xmm8[3],xmm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm8[0,1,2,3,4],xmm12[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2],xmm1[3],xmm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm11[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm7) | ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5],xmm10[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm7)
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r8)
@@ -2857,36 +2856,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-LABEL: load_i16_stride6_vf16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT:    vpermw %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm5, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm5, %zmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-NEXT:    vpermi2w %zmm6, %zmm5, %zmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
@@ -2912,36 +2903,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-LABEL: load_i16_stride6_vf16:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm6, %zmm5, %zmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512BW-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
@@ -2967,36 +2950,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-LABEL: load_i16_stride6_vf16:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512DQ-BW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm5, %zmm0
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512DQ-BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
 ; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
 ; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm6, %zmm5, %zmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512DQ-BW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
 ; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-NEXT:    # ymm7 = mem[0,1,0,1]
@@ -3022,36 +2997,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm6, %zmm5, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512DQ-BW-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
@@ -4135,51 +4102,50 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride6_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX2-NEXT:    subq $520, %rsp # imm = 0x208
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm3
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
+; AVX2-NEXT:    vpshufb %ymm8, %ymm1, %ymm4
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm4, %ymm4
 ; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm11, %ymm5
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm4, %xmm11
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm9
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3],xmm13[4],xmm11[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT:    vmovdqu %ymm15, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-NEXT:    vpshufb %ymm8, %ymm13, %ymm8
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm8, %ymm6
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
@@ -4187,23 +4153,23 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7]
+; AVX2-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
+; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-NEXT:    vpshufb %ymm15, %ymm12, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0],ymm14[1],ymm15[2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm15, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm13
@@ -4211,8 +4177,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3,4],xmm7[5,6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -4221,28 +4187,28 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
+; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6],mem[7]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm8
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7]
-; AVX2-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,2,0,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm12[2],xmm8[3],xmm12[4,5],xmm8[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = mem[0],ymm14[1],mem[2,3,4,5],ymm14[6],mem[7]
+; AVX2-NEXT:    vpshufb %ymm10, %ymm12, %ymm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
@@ -4250,22 +4216,22 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm12, %ymm7
+; AVX2-NEXT:    vpshufb %ymm6, %ymm15, %ymm8
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4,5],xmm1[6],xmm7[7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm1
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX2-NEXT:    vpshufb %ymm6, %ymm10, %ymm2
+; AVX2-NEXT:    vpshufb %ymm6, %ymm12, %ymm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -4279,115 +4245,117 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT:    vpshufb %ymm11, %ymm8, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm7, %ymm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,1,2,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm7, %xmm5
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7]
+; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpshufb %ymm11, %ymm2, %ymm11
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6],xmm6[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm7, %ymm8
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7],ymm8[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2],xmm8[3],xmm3[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpshufb %ymm6, %ymm5, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm4
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
+; AVX2-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
@@ -4413,94 +4381,96 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm9, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm5, 32(%r8)
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%r8)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
-; AVX2-NEXT:    vmovdqa %ymm4, 32(%r9)
-; AVX2-NEXT:    vmovdqa %ymm3, (%r9)
+; AVX2-NEXT:    vmovdqa %ymm5, 32(%r9)
+; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX2-NEXT:    addq $488, %rsp # imm = 0x1E8
+; AVX2-NEXT:    addq $520, %rsp # imm = 0x208
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    subq $520, %rsp # imm = 0x208
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm3
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm6
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm6, %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm11
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm15, %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm15, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm13, %ymm7
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm7, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm11, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm7, %xmm0
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm1
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm10, %xmm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0],ymm14[1],ymm4[2,3,4,5],ymm14[6],ymm4[7]
+; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm9, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm1[2],ymm14[3,4],ymm1[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm9
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm6
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm11, %xmm8
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
@@ -4508,187 +4478,188 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5,6],ymm12[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm12
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
 ; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
 ; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
-; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7]
+; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm4, %ymm13
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3],xmm11[4,5],xmm3[6],xmm11[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm6, %xmm3
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm3
 ; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
+; AVX2-FP-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm10, %xmm9
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm14, %ymm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm13, %ymm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm10, %xmm4
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm14, %ymm4
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm1
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm13, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm11, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm4, %ymm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm7[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm10, %xmm7
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1,2],xmm12[3],xmm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm4, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm1
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm15[2],ymm8[3],ymm15[4],ymm8[5,6],ymm15[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm12, %ymm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm15
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm12, %xmm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7],ymm11[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm15, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6],ymm7[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm8, %xmm9
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm8
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm8, %xmm10
+; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm8, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm6, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm9, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm4, 32(%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%r9)
@@ -4696,247 +4667,246 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX2-FP-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX2-FP-NEXT:    addq $520, %rsp # imm = 0x208
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride6_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX2-FCP-NEXT:    subq $520, %rsp # imm = 0x208
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm6, %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm11
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm7
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm7, %ymm4
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [6,5,4,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm5, %ymm13
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm6
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm12
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3],xmm10[4,5],xmm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = ymm9[0],mem[1],ymm9[2,3,4,5],mem[6],ymm9[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm10, %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6],xmm0[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm11
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3],xmm11[4,5],xmm7[6],xmm11[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm0
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,5,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm3
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm9
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [4,7,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm7, %ymm10
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm11
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1,2],xmm15[3],xmm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm14[4],xmm12[5,6],xmm14[7]
+; AVX2-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm8
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm14[3,4,5,6,7],ymm8[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm14[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm4
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7],ymm11[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm14, %ymm4
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
 ; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm9
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm9
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
@@ -4948,23 +4918,23 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, 32(%r9)
@@ -4972,231 +4942,229 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX2-FCP-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX2-FCP-NEXT:    addq $520, %rsp # imm = 0x208
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride6_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $72, %rsp
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm14
-; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7]
+; AVX512-NEXT:    subq $136, %rsp
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm13
+; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
 ; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm13
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm15
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm6
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm24
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm26
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm26
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm7
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm9
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm7, %zmm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm7, %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm16
-; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7]
-; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7]
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm29
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7]
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm16
+; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm7
+; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7]
+; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7]
+; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm22
 ; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm28
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm8
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm9
 ; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm18
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm10
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm9
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3]
-; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm15
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm25
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm27
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm20
+; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm10
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3]
+; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm8, %ymm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm24
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm25
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm30
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm29
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
-; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
+; AVX512-NEXT:    vpshufb %xmm10, %xmm4, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm0
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm30
 ; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm31
-; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm21
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm23
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm19
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm21
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm13
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm11[2],ymm13[3],ymm11[4],ymm13[5,6],ymm11[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm28
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm29
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm15
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm14
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm17 ^ (zmm0 & (zmm2 ^ zmm17))
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm12
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3],ymm12[4],ymm1[5,6],ymm12[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7],ymm12[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm16
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm12 & (zmm0 ^ zmm17))
 ; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm2))
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm8, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm30
-; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm18
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm0))
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm22
+; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm0 & (zmm5 ^ zmm2))
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm12 & (zmm4 ^ zmm0))
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5))
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4))
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm2
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
 ; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm11) | ymm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm14) | ymm10
 ; AVX512-NEXT:    movw $31, %ax
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm8
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm8
 ; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm12
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1]
@@ -5204,30 +5172,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm12
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm12
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm13
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm14) | ymm4
 ; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
@@ -5238,17 +5206,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
 ; AVX512-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
 ; AVX512-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm30, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm4 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm4, (%rsi)
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm15 = mem ^ (zmm3 & (zmm15 ^ mem))
-; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm15, (%rdx)
+; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ zmm2))
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
@@ -5256,541 +5225,536 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm20, (%r8)
 ; AVX512-NEXT:    vmovdqa64 %zmm10, (%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT:    addq $72, %rsp
+; AVX512-NEXT:    addq $136, %rsp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride6_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $136, %rsp
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm2
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-FCP-NEXT:    subq $200, %rsp
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm15
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm5
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm8
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm27
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm19
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm10
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm8
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm10
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6,7]
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],mem[2,3]
+; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3,4,5],ymm11[6],ymm7[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm25
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm10
 ; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm24
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm9[2,3],mem[2,3]
+; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0],ymm13[1],ymm0[2,3,4,5],ymm13[6],ymm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm27
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7],ymm13[8,9,10],ymm10[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm9[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,5,5,5]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm15, %ymm31
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm15, %ymm31
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm9
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [10,1,10,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX512-FCP-NEXT:    vpermd 192(%rdi), %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm19
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm20 = [2,9,8,11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [12,5,14,5]
+; AVX512-FCP-NEXT:    vpermd %zmm19, %zmm20, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
+; AVX512-FCP-NEXT:    vpermd %zmm19, %zmm16, %zmm6
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm24
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm5
+; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm17 ^ (zmm0 & (zmm11 ^ zmm17))
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm21
+; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3],ymm0[4],ymm11[5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7],ymm14[8,9,10],ymm12[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm17 ^ (zmm12 & (zmm13 ^ zmm17))
 ; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm11))
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm13))
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm7, %zmm7
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm19
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm28
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4))
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm6 ^ (zmm12 & (zmm3 ^ zmm6))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm22
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm17 & (zmm22 ^ zmm3))
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [0,3,2,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm2
+; AVX512-FCP-NEXT:    vpermd %zmm19, %zmm12, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm11) | ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4,5],ymm10[6],ymm15[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm15) | ymm14
 ; AVX512-FCP-NEXT:    movw $31, %ax
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm4
+; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm12, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm13
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5],ymm11[6],ymm5[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm11)
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm9 & ymm15)
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm3
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, (%rsp), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
 ; AVX512-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm19, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm20, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm2))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm2))
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm1))
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT:    addq $136, %rsp
+; AVX512-FCP-NEXT:    addq $200, %rsp
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride6_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    pushq %rax
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm10
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm20
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm22
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm18
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm19
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm14, %xmm5
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm5, %zmm16
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm4[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm19
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0],ymm8[1],ymm1[2,3,4,5],ymm8[6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm27
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm20
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm4, %xmm10
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm6[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm6, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm7
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm24
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm25
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm16 ^ (zmm18 & (zmm1 ^ zmm16))
 ; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm8
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm11, %xmm1
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm8, %zmm3
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm5, %xmm0
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm17 & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm14[3],xmm1[4,5],xmm14[6],xmm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm18 & (zmm3 ^ zmm1))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm28
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm29
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm30
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm16
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm25
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm30
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm13
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm18
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm18
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm11 & (zmm10 ^ zmm16))
 ; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm10))
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm15[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1,2],xmm10[3],xmm7[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm7, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm4))
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm6 ^ (zmm11 & (zmm3 ^ zmm6))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3))
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm2
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm6[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm14) | ymm12
 ; AVX512DQ-NEXT:    movw $31, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm11
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm7, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6],ymm13[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm7, %xmm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm12
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3,4,5],ymm9[6],ymm10[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm14) | ymm3
 ; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -5798,11 +5762,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm4))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm17 & (zmm12 ^ zmm4))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -5810,251 +5774,248 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    pushq %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    subq $40, %rsp
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm4
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm16
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm6
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm6, %zmm16
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm9
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3],xmm1[4,5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm4[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm8[1],ymm4[2,3,4,5],ymm8[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm28
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm11
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm6, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm10
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm24
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7],ymm11[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm11 & (zmm9 ^ zmm16))
 ; AVX512DQ-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm17 & (zmm2 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm30
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3],xmm15[4,5],xmm14[6],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm14, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm11 & (zmm3 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm31
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm15[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT:    vpermd 192(%rdi), %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm18, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm19, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm4, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm17
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3],ymm13[4],ymm0[5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7],ymm13[8,9,10],ymm12[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm12 & (zmm11 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm19 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm11))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm15[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1,2],xmm6[3],xmm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4],xmm11[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm12 & (zmm2 ^ zmm5))
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm0 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm19 & (zmm18 ^ zmm2))
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm13
 ; AVX512DQ-FCP-NEXT:    movw $31, %ax
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm6, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm13
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm13[4],xmm5[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm9[1],ymm13[2,3,4,5],ymm9[6],ymm13[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm8 & ymm13)
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm11 & ymm14)
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm1, %zmm1
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -6062,13 +6023,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm17 & (zmm7 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm17 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT:    popq %rax
+; AVX512DQ-FCP-NEXT:    addq $40, %rsp
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -8608,7 +8569,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i16_stride6_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $1272, %rsp # imm = 0x4F8
+; AVX2-NEXT:    subq $1224, %rsp # imm = 0x4C8
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps 672(%rdi), %ymm2
@@ -8621,58 +8582,58 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm7
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3]
+; AVX2-NEXT:    vmovdqu %ymm11, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1]
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm4[2,3]
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1]
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm6[1],ymm11[2,3,4,5],ymm6[6],ymm11[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm4, %ymm6
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm6, %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm11
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm10, %ymm7
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm11
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm9, %ymm8
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm8, %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm10
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm8, %ymm13
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm13
@@ -8680,9 +8641,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm13
-; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm6
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm12, %xmm13
+; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm7
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm7[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
@@ -8691,31 +8652,31 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm6[0],xmm11[1],xmm6[2,3],xmm11[4],xmm6[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm9, %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
+; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm7, %ymm3
+; AVX2-NEXT:    vpshufb %ymm6, %ymm8, %ymm3
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm9, %ymm13, %ymm1
+; AVX2-NEXT:    vpshufb %ymm6, %ymm13, %ymm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
@@ -8727,11 +8688,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8845,7 +8806,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm15, %xmm15
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
@@ -8906,324 +8867,326 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm5, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm8, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm3, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpshufb %ymm6, %ymm12, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpshufb %ymm8, %ymm9, %ymm13
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm1[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm12, %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm12
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm8, %ymm15, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm11[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm15[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5,6],xmm11[7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1,2],xmm10[3],xmm13[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = xmm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm13[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm8 = mem[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
+; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm10 = mem[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-NEXT:    vpshufb %ymm13, %ymm5, %ymm10
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm10 = mem[3,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm13, %ymm12, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm9, %ymm2
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX2-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm1 = mem[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm2 = mem[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm12, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX2-NEXT:    vpshufb %ymm7, %ymm15, %ymm1
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm1 = mem[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm2 = mem[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpshufb %ymm13, %ymm3, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm2 = mem[3,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw $244, (%rsp), %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm4 = mem[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpshufb %ymm13, %ymm11, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm11, %xmm3
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm6, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm11
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm11, %xmm5
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm13
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm13
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm13, %xmm9
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5],xmm9[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm15
-; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm9 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm15
+; AVX2-NEXT:    vpshufb %xmm8, %xmm15, %xmm8
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm9 = mem[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm11, %xmm8
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm7
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm7
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX2-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm9, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm6, 96(%r8)
-; AVX2-NEXT:    vmovdqa %ymm8, 32(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-NEXT:    vmovdqa %ymm5, (%r8)
-; AVX2-NEXT:    vmovdqa %ymm10, 96(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm10, 96(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, 64(%r8)
+; AVX2-NEXT:    vmovdqa %ymm3, (%r8)
+; AVX2-NEXT:    vmovdqa %ymm9, 96(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
-; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
-; AVX2-NEXT:    vmovdqa %ymm3, 64(%r9)
+; AVX2-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-NEXT:    vmovdqa %ymm4, 64(%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm7, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm1, 64(%rax)
-; AVX2-NEXT:    vmovdqa %ymm11, (%rax)
-; AVX2-NEXT:    addq $1272, %rsp # imm = 0x4F8
+; AVX2-NEXT:    vmovdqa %ymm6, 64(%rax)
+; AVX2-NEXT:    vmovdqa %ymm8, (%rax)
+; AVX2-NEXT:    addq $1224, %rsp # imm = 0x4C8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1304, %rsp # imm = 0x518
+; AVX2-FP-NEXT:    subq $1272, %rsp # imm = 0x4F8
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm2
@@ -9237,7 +9200,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
@@ -9251,98 +9214,98 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
 ; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm4, %ymm7
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm3
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm11
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm10, %ymm8
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm10, %ymm8
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm8, %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm9
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm8
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm9
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm8, %ymm13
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm8, %ymm13
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
 ; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm13
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm13, %ymm6
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm6, %ymm6
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm10, %ymm10
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm4
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm3
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm13, %ymm1
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm0
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm0
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm0
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9364,12 +9327,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm2
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm2
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm2
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
@@ -9382,9 +9345,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm4
 ; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9393,7 +9356,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd $109, (%rsp), %ymm2, %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm2
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
@@ -9447,14 +9410,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
 ; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm9 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6],xmm9[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
@@ -9467,17 +9430,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm7, %xmm0
 ; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
@@ -9488,7 +9451,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -9502,7 +9465,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm10, %ymm1
+; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm10, %ymm1
 ; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
@@ -9512,44 +9475,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm1
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
@@ -9558,252 +9486,293 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm14, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm14, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm8, %xmm0
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1]
+; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm0, %xmm2
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm13, %ymm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm7, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $219, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[0,3,2,1]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,1,2,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm7, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm14, %xmm3
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm2[2,1,0,3]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm15, %xmm2
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm8, %xmm11
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm15, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,1,0,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm0, %xmm6
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[0,1,2,1]
+; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm12, %ymm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm9
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm9, %xmm5
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm12, %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm4 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6],xmm4[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm14, %ymm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm11 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm1[1,2],xmm11[3],xmm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm13, %ymm4
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm8, %ymm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm4 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm7, %ymm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm3 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
-; AVX2-FP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm6 = mem[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm6
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm12, %ymm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm9, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm11
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm13
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm9 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm13
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm15
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm15, %xmm10
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm11, %xmm7
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm11
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm6
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm13, %xmm8
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm8, 96(%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm9, 32(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 96(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm10, 96(%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm14, 32(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm10, 96(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm9, 96(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm4, 64(%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, 64(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm11, (%rax)
-; AVX2-FP-NEXT:    addq $1304, %rsp # imm = 0x518
+; AVX2-FP-NEXT:    vmovdqa %ymm7, 32(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rax)
+; AVX2-FP-NEXT:    addq $1272, %rsp # imm = 0x4F8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
@@ -9819,16 +9788,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[2,3],ymm4[2,3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[0,1],ymm4[0,1]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
@@ -9837,43 +9806,43 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm7
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm7, %ymm5
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm11
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3],xmm8[4],xmm7[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm8
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm8
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm9
+; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm8
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0],ymm13[1],ymm15[2,3,4,5],ymm13[6],ymm15[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm13
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
 ; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm13
@@ -9881,67 +9850,67 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm13
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm6
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm3
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm11
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm4
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm2
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [6,5,4,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm4
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm4
 ; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9950,27 +9919,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm2
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm4
 ; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9981,17 +9949,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm15
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10012,70 +9979,69 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm9
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm10
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm14
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm10
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm12
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
+; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm15
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
-; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm9 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm12 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6],xmm12[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm15
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
-; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm0
-; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
@@ -10085,310 +10051,317 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm0
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm9, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,5,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm1
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,7,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm2
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm5
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm3
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm1
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm6
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm3
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm9, %ymm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm8, %ymm2
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[2,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm4
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm12
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2],xmm0[3],xmm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm4
+; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm10 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm1[1,2],xmm10[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6],xmm0[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
-; AVX2-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm6 = mem[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm5
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm13
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm10
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm11
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm5
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm6
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm12
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm10
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm5
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, 96(%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%r8)
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm7
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm8
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm11, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 96(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm9, 96(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, 32(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, 64(%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm11, (%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, 96(%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, 64(%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%rax)
 ; AVX2-FCP-NEXT:    addq $1304, %rsp # imm = 0x518
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
@@ -10396,7 +10369,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-LABEL: load_i16_stride6_vf64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $1480, %rsp # imm = 0x5C8
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm1
@@ -10408,17 +10381,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm21
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10427,10 +10401,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm12, %xmm1
-; AVX512-NEXT:    vextracti32x4 $1, %ymm12, %xmm22
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-NEXT:    vextracti32x4 $1, %ymm14, %xmm23
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm23[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
@@ -10438,55 +10412,56 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
+; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm29
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
+; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vinserti128 $1, 672(%rdi), %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm12, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm18
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-NEXT:    vpshufb %xmm9, %xmm15, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512-NEXT:    vpshufb %xmm10, %xmm7, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10496,99 +10471,101 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm11[0,2,0,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm31
 ; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm15, %xmm0
-; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm14
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm10, %xmm9, %xmm0
+; AVX512-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-NEXT:    vpshufb %xmm10, %xmm15, %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %xmm10, %xmm11, %xmm5
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm0
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
-; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm9, %ymm4
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5],xmm6[6],xmm2[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm1
+; AVX512-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm2
 ; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb %xmm10, %xmm12, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
-; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm5
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm1
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm12, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm12, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm26
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -10640,21 +10617,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm13, %ymm4
+; AVX512-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm8 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm4
 ; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm24
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm8, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm25
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
@@ -10666,21 +10645,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm23
@@ -10699,109 +10678,111 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm23 ^ (zmm29 & (zmm2 ^ zmm23))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm2))
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm14
+; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm23 ^ (zmm29 & (zmm0 ^ zmm23))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
 ; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm14
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm28
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5,6],xmm1[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm28
 ; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,1,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1,2],xmm1[3],xmm14[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
 ; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm13
-; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm13
-; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm25
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5,6],xmm1[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm8, %ymm12
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm8
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm8
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm28 ^ (zmm29 & (zmm0 ^ zmm28))
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm22 & (zmm25 ^ zmm0))
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[3,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm10[1,1,1,1,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[3,1,2,1,4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2],xmm1[3],xmm6[4,5,6,7]
 ; AVX512-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm4
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm2))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm28
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm1))
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7]
@@ -10815,7 +10796,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm22
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
 ; AVX512-NEXT:    vpshufb %xmm13, %xmm5, %xmm4
 ; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
@@ -10910,12 +10891,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
 ; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
 ; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
@@ -11022,30 +11003,31 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1416, %rsp # imm = 0x588
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    subq $1800, %rsp # imm = 0x708
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [6,5,4,7]
 ; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm1
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
@@ -11055,31 +11037,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm1
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm16, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vinserti128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 736(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 704(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm0
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
 ; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm1
@@ -11088,33 +11069,33 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm18
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm14, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm15, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm16, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm1
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm2
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
@@ -11124,516 +11105,511 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm16, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm0[3],xmm9[4,5],xmm0[6],xmm9[7]
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5],xmm6[6],xmm2[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm13, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm27 = [4,7,6,5]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm27, %ymm15
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm28 = [10,1,10,3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vpermd 192(%rdi), %zmm28, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm22 = [2,9,8,11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm23 = [12,5,14,5]
+; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm22, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm23, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm18
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm27, %ymm4
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm18
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm16
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1]
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm25
+; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm22, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm23, %zmm14
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm6
-; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm7
+; AVX512-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm21
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm29 & (zmm4 ^ zmm3))
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm26 & (zmm5 ^ zmm4))
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm20 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm20 & (zmm5 ^ zmm4))
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm27, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512-FCP-NEXT:    vpermd 576(%rdi), %zmm28, %zmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm26
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm22, %zmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm3
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm23, %zmm9
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,6,5,6,4]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm11
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm16
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm27, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
+; AVX512-FCP-NEXT:    vpermd 384(%rdi), %zmm28, %zmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
+; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm22, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm23, %zmm3
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm12
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm11 ^ (zmm29 & (zmm1 ^ zmm11))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm26 & (zmm25 ^ zmm1))
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm11, %zmm24
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm13
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm16 ^ (zmm29 & (zmm1 ^ zmm16))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm12 = mem[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm13
+; AVX512-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm15 = mem[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm13, %zmm16
 ; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm15 = mem[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm15
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm23
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm29 & (zmm0 ^ zmm24))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm26 & (zmm23 ^ zmm0))
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm23
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm16 ^ (zmm29 & (zmm0 ^ zmm16))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm20 & (zmm23 ^ zmm0))
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm8
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm8, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
 ; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm3
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm29 & (zmm5 ^ zmm0))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm26 & (zmm28 ^ zmm5))
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm30
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm20 & (zmm27 ^ zmm5))
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm29) | ymm3
-; AVX512-FCP-NEXT:    movw $31, %ax
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm30 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm2
-; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm15 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
+; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm20
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm29 = [0,3,2,9]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm29, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm12
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm30) | ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    movw $31, %ax
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm16 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm4
+; AVX512-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm29, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm13 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm30) | ymm2
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm3
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm13
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm0
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm9
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm4 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm4
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm29, %zmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm5
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm31 {%k1}
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm8 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm12
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm17
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm29) | ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm29, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm14
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm18
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm30) | ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm19
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm14
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm13, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm29) | ymm14
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = (ymm11 & ymm30) | ymm15
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm8, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm9, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
 ; AVX512-FCP-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm25, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm30))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm31))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm0))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm26 & (zmm1 ^ zmm2))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm31, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm26, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm16))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm3 & (zmm12 ^ zmm0))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm2))
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 64(%rcx)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, 64(%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT:    addq $1416, %rsp # imm = 0x588
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512-FCP-NEXT:    addq $1800, %rsp # imm = 0x708
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride6_vf64:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    subq $840, %rsp # imm = 0x348
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm1
@@ -11645,18 +11621,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm1
@@ -11664,10 +11640,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
-; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm15, %xmm21
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
+; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm20[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
@@ -11675,34 +11652,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti128 $1, 480(%rdi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm15, %xmm1
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm18
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm27
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, 672(%rdi), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm13, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
 ; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
@@ -11713,21 +11691,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm11
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm12, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
@@ -11736,102 +11714,104 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm11[0,2,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm30
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm8
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm17 ^ (zmm16 & (zmm7 ^ zmm17))
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm7
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm7
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm11, %xmm7
 ; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm8, %xmm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm9, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm16 & (zmm6 ^ zmm5))
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
 ; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm2
 ; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm15, %xmm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm5
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm13, %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm16 & (zmm5 ^ zmm1))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
@@ -11882,21 +11862,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm4
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm13, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm8, %ymm4
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm27
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm8, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm28
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
@@ -11908,21 +11890,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,6,4]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm19
@@ -11941,100 +11923,102 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm28
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm19 ^ (zmm21 & (zmm2 ^ zmm19))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm2))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm2, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm19 ^ (zmm21 & (zmm0 ^ zmm19))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm14
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm20
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1,2],xmm1[3],xmm14[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm27
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm8, %ymm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm21 & (zmm0 ^ zmm20))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm18 & (zmm27 ^ zmm0))
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm10[1,1,1,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2],xmm1[3],xmm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm4
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm21 & (zmm2 ^ zmm0))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm2))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm20
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm21 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm1))
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm13
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7]
@@ -12047,7 +12031,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm5, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm18
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
@@ -12141,12 +12125,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm21
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm13
@@ -12230,29 +12214,29 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $872, %rsp # imm = 0x368
+; AVX512DQ-FCP-NEXT:    subq $1320, %rsp # imm = 0x528
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm17 = [6,5,4,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm17, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm22
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
@@ -12263,13 +12247,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, 480(%rdi), %ymm1, %ymm1
@@ -12277,17 +12261,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 736(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 704(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -12295,13 +12279,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, 672(%rdi), %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
@@ -12313,504 +12298,498 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm17, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm17 ^ (zmm16 & (zmm6 ^ zmm17))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm18 ^ (zmm16 & (zmm6 ^ zmm18))
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3],xmm6[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm16 & (zmm7 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm16 & (zmm5 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm23 = [4,7,6,5]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1]
+; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm23, %ymm15
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm25 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd 192(%rdi), %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $36, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm20
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm19 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm23
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm22
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm23, %ymm4
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm6 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm25
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm19, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
+; AVX512DQ-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm29
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm20 & (zmm4 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm24 & (zmm4 ^ zmm3))
 ; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm4))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm16
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm23, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT:    vpermd 576(%rdi), %zmm25, %zmm11
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm18, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,6,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm23, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd 384(%rdi), %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm5 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm27
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm20 & (zmm1 ^ zmm9))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm28 & (zmm27 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm9, %zmm21
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm12
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm31
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm26 ^ (zmm24 & (zmm1 ^ zmm26))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm28 & (zmm31 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm18
+; AVX512DQ-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm12 = mem[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm15 = mem[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm13, %zmm25
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm26
+; AVX512DQ-FCP-NEXT:    vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm15 = mem[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm21 ^ (zmm20 & (zmm1 ^ zmm21))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm25 ^ (zmm24 & (zmm0 ^ zmm25))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm3
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm20 & (zmm1 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm24 & (zmm5 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm28 & (zmm25 ^ zmm5))
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm0
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm26 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm26, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd $36, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm2
 ; AVX512DQ-FCP-NEXT:    movw $31, %ax
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm0, %zmm0, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm25
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm31
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm4
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm24
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm8[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm26, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4],xmm15[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm22
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm10, %zmm0, %zmm22 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm21
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm20) | ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm7[1],xmm13[2,3],xmm7[4],xmm13[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm15
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm12, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm15
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm20) | ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm6, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3],xmm14[4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm22))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm28 & (zmm13 ^ zmm24))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm28 & (zmm10 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm24))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm22))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm2 & (zmm12 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 64(%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512DQ-FCP-NEXT:    addq $872, %rsp # imm = 0x368
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQ-FCP-NEXT:    addq $1320, %rsp # imm = 0x528
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 038c73bd9fed2..56bdda1d8e76a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -553,49 +553,49 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm3
 ; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm4
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6],xmm2[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT:    vmovq %xmm1, (%rsi)
-; AVX2-NEXT:    vmovq %xmm6, (%rdx)
-; AVX2-NEXT:    vmovq %xmm3, (%rcx)
-; AVX2-NEXT:    vmovq %xmm4, (%r8)
-; AVX2-NEXT:    vmovq %xmm5, (%r9)
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vmovq %xmm2, (%rsi)
+; AVX2-NEXT:    vmovq %xmm3, (%rdx)
+; AVX2-NEXT:    vmovq %xmm4, (%rcx)
+; AVX2-NEXT:    vmovq %xmm5, (%r8)
+; AVX2-NEXT:    vmovq %xmm6, (%r9)
 ; AVX2-NEXT:    vmovq %xmm7, (%r10)
 ; AVX2-NEXT:    vmovq %xmm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
@@ -703,20 +703,20 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
@@ -805,20 +805,20 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
@@ -1283,8 +1283,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,7,7]
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,2]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
@@ -1303,7 +1302,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovdqa %xmm9, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm10, (%r8)
 ; AVX-NEXT:    vmovdqa %xmm11, (%r9)
-; AVX-NEXT:    vmovdqa %xmm12, (%r10)
+; AVX-NEXT:    vmovaps %xmm12, (%r10)
 ; AVX-NEXT:    vmovdqa %xmm0, (%rax)
 ; AVX-NEXT:    retq
 ;
@@ -1413,20 +1412,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
@@ -1448,15 +1451,15 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm14
 ; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -1465,7 +1468,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rsi)
 ; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %xmm9, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %xmm10, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %xmm11, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r10)
@@ -1492,20 +1495,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
@@ -1527,15 +1534,15 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
 ; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -1544,7 +1551,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm10, (%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm11, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm7, (%r10)
@@ -1641,75 +1648,79 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [4,1]
+; AVX512-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4],xmm7[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm1[6],xmm8[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
 ; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm11
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [0,5]
+; AVX512-FCP-NEXT:    vpermq %zmm0, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0],xmm10[1,2,3,4,5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [8,1,10,11]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %xmm10, (%r10)
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -1803,75 +1814,79 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [8,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [4,1]
+; AVX512DQ-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm1[6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [0,5]
+; AVX512DQ-FCP-NEXT:    vpermq %zmm0, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm1[0],xmm10[1,2,3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [8,1,10,11]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm10, (%r10)
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -2820,9 +2835,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
 ; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm12
-; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm13
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
+; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm12
+; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm13
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
@@ -2844,7 +2859,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm13[0],xmm12[1],xmm13[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
@@ -2965,7 +2980,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm13, %xmm14
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm13
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
@@ -2989,9 +3004,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm12
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm13
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm12
+; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm13
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
 ; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
@@ -3003,7 +3018,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm15, %xmm11
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
@@ -3013,7 +3028,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm13[0],xmm12[1],xmm13[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -3024,7 +3039,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
@@ -3132,7 +3147,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm14
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
@@ -3168,7 +3183,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm8
 ; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm8, %xmm8
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4,5,6,7],ymm13[8],ymm4[9,10,11,12,13,14,15]
@@ -3191,7 +3206,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
@@ -3384,150 +3399,149 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride7_vf16:
 ; AVX512-FCP:       # %bb.0:
+; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm9
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm3
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm14, %ymm14
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3,4,5],xmm9[6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[0,1,1,2]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm9, %zmm9
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm9, %ymm12, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm9, %ymm14, %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm14
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm13, %ymm14, %ymm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm16, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm13
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm14
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm13, %zmm13
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
 ; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm12, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm13, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%r9)
-; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -3705,150 +3719,149 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16:
 ; AVX512DQ-FCP:       # %bb.0:
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3,4,5],xmm9[6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[0,1,1,2]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm9, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm9, %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm9, %ymm14, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm10, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm14
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm14
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm13, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
 ; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm12, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm13, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%r9)
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -5839,9 +5852,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa 432(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-NEXT:    vmovdqa 416(%rdi), %xmm3
+; AVX2-NEXT:    vmovdqa 432(%rdi), %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -5864,9 +5877,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm11
-; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
+; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm11
+; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5],xmm1[6],xmm11[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
@@ -5896,7 +5909,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -5922,7 +5935,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm8
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
@@ -6183,7 +6196,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm14 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
@@ -6243,18 +6256,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
 ; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vmovdqa 432(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %xmm3
+; AVX2-FP-NEXT:    vmovdqa 432(%rdi), %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
 ; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm14
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
 ; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
@@ -6266,9 +6279,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm8, %xmm10
-; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm15
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5],xmm12[6],xmm15[7]
 ; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -6284,7 +6297,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm14
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
@@ -6297,7 +6310,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -6309,7 +6322,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
@@ -6324,7 +6337,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -6582,7 +6595,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -6609,7 +6622,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -6640,9 +6653,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm9
@@ -6672,8 +6685,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm8
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
@@ -6698,7 +6711,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
@@ -6764,8 +6777,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm12
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7]
@@ -6774,8 +6787,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm21
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
@@ -6799,7 +6812,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm6
 ; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm14
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
@@ -6808,7 +6821,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
@@ -6820,7 +6833,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm22
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6829,7 +6842,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm19
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
@@ -6845,7 +6858,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm23
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3,4],ymm8[5],ymm4[6,7]
@@ -6856,7 +6869,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm20
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[0,1,1,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
@@ -6868,7 +6881,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpsrlq $48, %xmm14, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm18
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
@@ -6903,8 +6916,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
@@ -6921,10 +6933,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
@@ -6948,7 +6959,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
@@ -6978,7 +6989,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
@@ -6994,7 +7005,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm16
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
@@ -7011,18 +7022,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm15[0,1,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3],ymm2[4,5,6,7,8,9,10],ymm10[11],ymm2[12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm14
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3,4,5],xmm14[6],xmm10[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,1,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5,6,7,8,9,10],ymm11[11],ymm2[12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3,4,5],xmm14[6],xmm11[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512-NEXT:    vpor %ymm2, %ymm10, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512-NEXT:    vpor %ymm2, %ymm11, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
@@ -7039,7 +7050,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
 ; AVX512-NEXT:    vpor %ymm2, %ymm9, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6],ymm9[7,8],ymm2[9,10,11,12,13,14],ymm9[15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
@@ -7051,14 +7062,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6,7,8],ymm11[9],ymm9[10,11,12,13,14,15]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6,7,8],ymm10[9],ymm9[10,11,12,13,14,15]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm10
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm12
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm14
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm0, %zmm25
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm26
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,4,6,7]
@@ -7087,7 +7098,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm4 & (zmm11 ^ zmm21))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm21))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm5 ^ (mem & (zmm22 ^ zmm5))
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm4 & (zmm12 ^ zmm22))
@@ -7099,8 +7110,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm4 & (zmm27 ^ zmm20))
 ; AVX512-NEXT:    vmovdqa32 %zmm25, %zmm27 {%k1}
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm18 ^ (zmm4 & (zmm1 ^ zmm18))
-; AVX512-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm11, (%rsi)
+; AVX512-NEXT:    vmovdqa32 %zmm11, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm10, (%rsi)
 ; AVX512-NEXT:    vmovdqa64 %zmm12, (%rdx)
 ; AVX512-NEXT:    vmovdqa64 %zmm23, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm27, (%r8)
@@ -7118,312 +7129,317 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride7_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm22
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm0, %zmm13
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,4,8,11,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm19, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm28
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm28[0,1,0,2]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm15, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vporq %ymm8, %ymm11, %ymm23
-; AVX512-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm30
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm30[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13]
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm21, %zmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm24
+; AVX512-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm3, %zmm24 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4],xmm3[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,7,11,14]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm20 = [4,5,6,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm25
 ; AVX512-FCP-NEXT:    vmovdqa 240(%rdi), %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vporq %ymm4, %ymm0, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm24
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm21
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3,4,5],xmm3[6],xmm8[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %ymm3, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0,1,2,3],xmm10[4],xmm15[5],xmm10[6],xmm15[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm27
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm28
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm20, %zmm28
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm18, %zmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm10, %zmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,5,8,12,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm18, %zmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm13, %ymm2
-; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm19, %zmm9
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vporq %ymm3, %ymm2, %ymm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm28[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm10, %zmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm30[0,1,1,2]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm10, %zmm16
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm15[4],xmm10[5],xmm15[6],xmm10[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,0,0,0,5,8,12,15]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm13, %zmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %ymm13, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm17
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm21, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm17
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vporq %ymm1, %ymm10, %ymm20
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm30[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm1, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm15
+; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512-FCP-NEXT:    vpor %ymm10, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm14, %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm16, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT:    vpsrld $16, %xmm12, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm10, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm30 & (zmm16 ^ zmm0))
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
+; AVX512-FCP-NEXT:    vpor %ymm12, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm0
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm22
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm13, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm8
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm14, %xmm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm3, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ zmm0))
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3,4,5],xmm9[6],xmm3[7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,6,9,13,0]
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm10, %zmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm10, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm26
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm10, %zmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0]
-; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,0,0,0,6,9,13,0]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512-FCP-NEXT:    vpor %ymm3, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
+; AVX512-FCP-NEXT:    vpermd %ymm30, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm18, %zmm9
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
+; AVX512-FCP-NEXT:    vpsrld $16, %xmm11, %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm18
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm30, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm11
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2],xmm1[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,0,0,0,6,10,13,0]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm9
-; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2],xmm0[3],xmm13[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm13, %zmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm4, %zmm4
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm3 & (zmm26 ^ zmm23))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm20 ^ (mem & (zmm24 ^ zmm20))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm24))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm30 & (zmm15 ^ zmm21))
-; AVX512-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm28, %zmm15 {%k1}
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm30 & (zmm19 ^ zmm18))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm19 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm30 & (zmm11 ^ zmm25))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r9)
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm10 ^ (zmm30 & (zmm0 ^ zmm10))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm4 & (zmm3 ^ zmm24))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm5 ^ (mem & (zmm27 ^ zmm5))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm27))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm29 & (zmm16 ^ zmm23))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
+; AVX512-FCP-NEXT:    kmovw %edi, %k1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm16 {%k1}
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm20))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm21 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm18 ^ (zmm29 & (zmm12 ^ zmm18))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm29 & (zmm2 ^ zmm9))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride7_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm15
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm7
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
@@ -7444,22 +7460,22 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-NEXT:    vpbroadcastw 252(%rdi), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm18 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm15
 ; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm14
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7475,11 +7491,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm21
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -7495,19 +7511,19 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm22
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm26
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm17
+; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm27
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -7527,138 +7543,138 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm2[6],xmm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm29
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm30
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
 ; AVX512DQ-NEXT:    vpsrld $16, %xmm13, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm28
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm29
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm24
-; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm11
 ; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm5
 ; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm13
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm31
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~mem)
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & ~mem)
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm25 & (zmm22 ^ zmm19))
 ; AVX512DQ-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1}
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm22 {%k1}
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm2
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm19
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm19
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm4
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm25 & (zmm19 ^ zmm17))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1}
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm23
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm26
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm25 & (zmm19 ^ zmm27))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm19 {%k1}
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2],ymm7[3,4,5],ymm10[6],ymm7[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm27
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm6[2],ymm12[3,4],ymm6[5],ymm12[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm11[0,1,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm12
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3,4,5],xmm12[6],xmm15[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
@@ -7668,16 +7684,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm12, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm25 & (zmm11 ^ zmm20))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm20 ^ (zmm25 & (zmm1 ^ zmm20))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7,8,9,10,11],ymm12[12],ymm3[13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm20
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm15
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
@@ -7687,33 +7703,32 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm12, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm15
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm15[0],xmm0[1],xmm15[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7],ymm12[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7],ymm12[8,9,10,11,12],ymm4[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm12
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm15
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm12, %zmm7, %zmm27
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm12, %zmm4, %zmm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm15
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
@@ -7723,48 +7738,49 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm24 ^ (zmm25 & (zmm2 ^ zmm24))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm26, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm27 ^ (zmm25 & (zmm3 ^ zmm27))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm1 & (zmm8 ^ zmm18))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm0 ^ (mem & (zmm21 ^ zmm0))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm21))
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm17 ^ (zmm25 & (zmm3 ^ zmm17))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm4 ^ (mem & (zmm21 ^ zmm4))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm0 & (zmm9 ^ zmm21))
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rsi)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm22, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm19, (%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -7774,295 +7790,290 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm19, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm25[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm29
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vporq %ymm8, %ymm11, %ymm23
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm30
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm30[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm29, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm24
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm16
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm17, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm28
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,5,8,12,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm19 & (zmm16 ^ zmm9))
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm3, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4],xmm3[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,7,11,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm21 = [4,5,6,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm21, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3,4,5],xmm3[6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vporq %ymm8, %ymm3, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0,1,2,3],xmm10[4],xmm15[5],xmm10[6],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm27
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm21, %zmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm30[0,1,1,2]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm10, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm15[4],xmm10[5],xmm15[6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [1,0,0,0,5,8,12,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm29, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm18
 ; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm14, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7],ymm14[8,9,10],ymm9[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm10, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3,4,5],xmm13[6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm30[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm21
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm17 & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm12, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm9, %zmm0, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm10
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm9, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm15, %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm19 & (zmm10 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm14, %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm17 & (zmm3 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5],xmm0[6],xmm8[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm12, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm18, %zmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,0,0,0,6,9,13,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm13, %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermd %ymm30, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm22, %zmm17, %zmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm30, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm20, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5,6,7],ymm12[8,9,10,11,12],ymm9[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm12 ^ (zmm19 & (zmm11 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [3,0,0,0,6,10,13,0]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm19 ^ (zmm17 & (zmm8 ^ zmm19))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm4, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm19 & (zmm0 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm17 & (zmm2 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm23))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm21 ^ (mem & (zmm24 ^ zmm21))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm24))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm24))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm22 ^ (mem & (zmm27 ^ zmm22))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm27))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm23))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm18, %zmm0, %zmm16 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -12052,9 +12063,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa 656(%rdi), %xmm13
-; AVX2-NEXT:    vmovdqa 640(%rdi), %xmm14
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
+; AVX2-NEXT:    vmovdqa 640(%rdi), %xmm13
+; AVX2-NEXT:    vmovdqa 656(%rdi), %xmm14
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12082,11 +12093,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa 432(%rdi), %xmm4
+; AVX2-NEXT:    vmovdqa 416(%rdi), %xmm4
 ; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa 416(%rdi), %xmm5
+; AVX2-NEXT:    vmovdqa 432(%rdi), %xmm5
 ; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5],xmm5[6],xmm4[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -12114,9 +12125,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-NEXT:    vmovdqa 880(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa 864(%rdi), %xmm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vmovdqa 864(%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa 880(%rdi), %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -12142,9 +12153,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm5
-; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm15
-; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
+; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5],xmm3[6],xmm15[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
@@ -12178,7 +12189,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12207,7 +12218,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm9, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
@@ -12236,8 +12247,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm9
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT:    vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
@@ -12262,7 +12273,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
@@ -12797,7 +12808,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
@@ -12913,11 +12924,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
 ; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vmovdqa 656(%rdi), %xmm3
+; AVX2-FP-NEXT:    vmovdqa 640(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 640(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovdqa 656(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12926,9 +12937,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm4, %xmm8
-; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
@@ -12945,11 +12956,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vmovdqa 432(%rdi), %xmm9
+; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %xmm9
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 432(%rdi), %xmm8
 ; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5],xmm8[6],xmm9[7]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
@@ -12974,9 +12985,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX2-FP-NEXT:    vmovdqa 880(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 864(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-FP-NEXT:    vmovdqa 864(%rdi), %xmm1
+; AVX2-FP-NEXT:    vmovdqa 880(%rdi), %xmm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm15, %xmm15
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15]
@@ -12998,9 +13009,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm11
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm14
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7]
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm14
+; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm5
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm5[6],xmm14[7]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -13032,8 +13043,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm7 = mem[0],xmm4[1],mem[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
 ; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -13045,7 +13056,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm9
-; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
@@ -13063,7 +13074,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm12
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
@@ -13091,8 +13102,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FP-NEXT:    vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm9 = mem[0],xmm9[1],mem[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
@@ -13115,7 +13126,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -13638,7 +13649,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm10 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm3
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
@@ -13693,7 +13704,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -13755,9 +13766,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,4,7,3,6,0,0,0]
@@ -13783,7 +13794,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
 ; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm13, %xmm13
 ; AVX2-FCP-NEXT:    vmovdqa %xmm4, %xmm6
-; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -13838,8 +13849,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
@@ -13862,7 +13873,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm10
-; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm10
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
@@ -13997,48 +14008,47 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $1864, %rsp # imm = 0x748
-; AVX512-NEXT:    vmovdqa 480(%rdi), %ymm1
+; AVX512-NEXT:    subq $1800, %rsp # imm = 0x708
+; AVX512-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX512-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm11[2],ymm2[3,4,5],ymm11[6],ymm2[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm12
 ; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastw 700(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa64 672(%rdi), %xmm22
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm22[0,1,0,3]
+; AVX512-NEXT:    vmovdqa64 672(%rdi), %xmm21
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm21[0,1,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm15
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7]
-; AVX512-NEXT:    vmovdqa %ymm5, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm21
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm21[0,1,0,2]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm14
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm19
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm19[0,1,0,2]
+; AVX512-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpbroadcastw 252(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm4
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
@@ -14046,81 +14056,85 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa %ymm7, %ymm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm18
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-NEXT:    vpor %ymm0, %ymm5, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
-; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm20
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
+; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
 ; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm4[1],xmm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 528(%rdi), %xmm7
-; AVX512-NEXT:    vmovdqa %ymm10, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT:    vmovdqa %ymm10, %ymm13
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm7, %ymm3
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm11
+; AVX512-NEXT:    vmovdqa %ymm11, %ymm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512-NEXT:    vpor %ymm3, %ymm6, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm11
 ; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm26
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7]
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm22
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
 ; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX512-NEXT:    vmovdqa64 640(%rdi), %ymm16
-; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm16[0,1,0,2]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm25[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT:    vmovdqa 688(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 688(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
-; AVX512-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm22
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm15
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm14
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -14130,41 +14144,41 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpor %ymm6, %ymm8, %ymm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm21[0,1,1,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm9, %zmm8, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX512-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
 ; AVX512-NEXT:    vpshufb %ymm7, %ymm8, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
-; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7]
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
 ; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
 ; AVX512-NEXT:    vpor %ymm7, %ymm5, %ymm5
 ; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm9
-; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm30
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
@@ -14173,68 +14187,71 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7]
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
-; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
+; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
 ; AVX512-NEXT:    vpshufb %ymm5, %ymm7, %ymm5
 ; AVX512-NEXT:    vpor %ymm5, %ymm6, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm15
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm10
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm10
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm19[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
 ; AVX512-NEXT:    vpbroadcastw 232(%rdi), %xmm6
-; AVX512-NEXT:    vpsrlq $48, %xmm23, %xmm7
+; AVX512-NEXT:    vpsrlq $48, %xmm24, %xmm7
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm12
-; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm16[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-NEXT:    vpbroadcastw 680(%rdi), %xmm5
-; AVX512-NEXT:    vpsrlq $48, %xmm3, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT:    vpsrlq $48, %xmm21, %xmm6
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm5, %zmm1, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti32x4 $2, %xmm5, %zmm1, %zmm27
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm11
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
@@ -14247,13 +14264,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
-; AVX512-NEXT:    vmovdqa %xmm4, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm4, %xmm15
 ; AVX512-NEXT:    vpsrld $16, %xmm4, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm4
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm8
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
@@ -14265,12 +14282,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6]
 ; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm11
-; AVX512-NEXT:    vpsrld $16, %xmm28, %xmm9
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm12
+; AVX512-NEXT:    vpsrld $16, %xmm3, %xmm9
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm9, %zmm8, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
@@ -14281,28 +14299,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm25[0,1,2,1,4,5,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
+; AVX512-NEXT:    vinserti32x4 $2, (%rsp), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm17
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm19
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
@@ -14312,20 +14330,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm7
-; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm22
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
 ; AVX512-NEXT:    vmovdqa 800(%rdi), %ymm3
 ; AVX512-NEXT:    vmovdqa 768(%rdi), %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
@@ -14334,150 +14353,150 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 832(%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa 864(%rdi), %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa 832(%rdi), %ymm9
+; AVX512-NEXT:    vmovdqa 864(%rdi), %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm16
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm3
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm22
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm23
+; AVX512-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,1,2,1,4,5,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm3
+; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
-; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm14
-; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15]
+; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm15
+; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6],ymm0[7,8,9,10,11,12,13],ymm12[14],ymm0[15]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm12
+; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
+; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm12
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm24
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm12
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
-; AVX512-NEXT:    vextracti32x4 $1, %ymm12, %xmm25
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4,5,6],xmm7[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT:    vextracti32x4 $1, %ymm7, %xmm25
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm25[2,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm11, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0],ymm4[1],ymm10[2,3],ymm4[4],ymm10[5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm7, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm21
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm22
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm11
+; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ~ymm20)
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7]
-; AVX512-NEXT:    vmovdqa %ymm15, %ymm13
-; AVX512-NEXT:    vextracti32x4 $1, %ymm0, %xmm25
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa %ymm3, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm14[2],ymm3[3,4,5],ymm14[6],ymm3[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm25)
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm12
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm13
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm16
+; AVX512-NEXT:    vmovdqa %ymm4, %ymm10
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm20)
+; AVX512-NEXT:    vmovdqa %ymm11, %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm25)
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -14485,39 +14504,40 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm1 = mem[0,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm15
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm31
+; AVX512-NEXT:    vmovdqa %ymm14, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2],ymm13[3,4,5],ymm3[6],ymm13[7]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa %ymm10, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm12, %ymm12
-; AVX512-NEXT:    vpor %ymm0, %ymm12, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm3[2],ymm12[3,4,5],ymm3[6],ymm12[7]
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm17
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm21
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[1,1,2,0]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
+; AVX512-NEXT:    vpor %ymm7, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -14525,164 +14545,162 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm1 = mem[0,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm25
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm24
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm23
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm24
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm21
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm15
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm22
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
-; AVX512-NEXT:    vmovdqa %ymm8, %ymm5
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm4
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm5
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm4
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm30
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm11
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm28
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512-NEXT:    vmovdqa %ymm10, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[0,1,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm26
-; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm27
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
+; AVX512-NEXT:    vmovdqa %ymm11, %ymm12
+; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm21[0,1,0,1]
+; AVX512-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7,8,9,10],ymm7[11],ymm2[12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5],xmm11[6],xmm7[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm11, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm13
-; AVX512-NEXT:    vpor %ymm11, %ymm13, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm26
+; AVX512-NEXT:    vpor %ymm2, %ymm7, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm29
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm2[2],ymm11[3,4,5],ymm2[6],ymm11[7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm30
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm0))
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm14
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm12
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm21 & (zmm27 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm27
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm4
-; AVX512-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm21[0,1,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7,8,9,10],ymm11[11],ymm3[12,13,14,15]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm15[0,1,0,1]
+; AVX512-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm24
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm23
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm22
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm26
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm23
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm24
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm22
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm8
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm10
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm9
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm9
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm14[2,3],ymm9[4,5],ymm14[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -14691,40 +14709,38 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm20
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm15
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm28
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm31
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3],xmm11[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm4
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm30
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm28
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm29
-; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -14732,93 +14748,93 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm19 & (zmm30 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm21 & (zmm16 ^ zmm0))
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, (%rsp), %zmm0, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm5
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[0,1,3,1]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-NEXT:    vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[0,1,3,1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7,8],ymm8[9],ymm6[10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4,5,6,7,8],ymm9[9],ymm6[10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm19 & (zmm13 ^ zmm6))
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm21 & (zmm13 ^ zmm6))
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm8
-; AVX512-NEXT:    vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT:    vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm8 = ymm8[0,1],mem[2],ymm8[3,4,5],mem[6],ymm8[7]
 ; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = mem ^ (zmm9 & (zmm3 ^ mem))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload
 ; AVX512-NEXT:    # zmm10 = mem ^ (zmm9 & (zmm10 ^ mem))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem))
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload
 ; AVX512-NEXT:    # zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ mem))
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ mem))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm10))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm12))
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm26 = zmm26 ^ (zmm9 & (zmm26 ^ mem))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm3))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm10))
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1]
 ; AVX512-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7]
@@ -14830,55 +14846,53 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, (%rsp), %zmm0, %zmm14 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm23
+; AVX512-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm23
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = zmm12 ^ (zmm19 & (zmm12 ^ mem))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ mem))
 ; AVX512-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm12 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = zmm6 ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm7
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = zmm6 ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm3 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm3 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm31 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm31 = zmm31 ^ (zmm21 & (zmm31 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm31 {%k1}
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm25 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm25 = zmm25 ^ (zmm21 & (zmm25 ^ mem))
 ; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm25 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = mem ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT:    vmovdqa32 %zmm10, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm3, (%rsi)
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = mem ^ (zmm21 & (zmm3 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm10, %zmm3 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm26, (%rsi)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%rsi)
 ; AVX512-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
 ; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdx)
 ; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm12, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm6, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm25, 64(%r8)
-; AVX512-NEXT:    vmovdqa64 %zmm8, (%r8)
-; AVX512-NEXT:    vmovdqa64 %zmm6, (%r9)
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa32 %zmm14, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm31, (%r8)
+; AVX512-NEXT:    vmovdqa64 %zmm3, (%r9)
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm1 = mem ^ (zmm19 & (zmm1 ^ mem))
-; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%r9)
+; AVX512-NEXT:    vmovdqa32 %zmm14, %zmm1 {%k1}
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm12 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm12 = mem ^ (zmm21 & (zmm12 ^ mem))
+; AVX512-NEXT:    vmovdqa32 %zmm15, %zmm12 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm11 = mem ^ (zmm19 & (zmm11 ^ mem))
-; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm11 = mem ^ (zmm21 & (zmm11 ^ mem))
+; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm1 {%k1}
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm11 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%rax)
@@ -14887,741 +14901,781 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm0
 ; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    addq $1864, %rsp # imm = 0x748
+; AVX512-NEXT:    addq $1800, %rsp # imm = 0x708
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1800, %rsp # imm = 0x708
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm26
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm3, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm17
-; AVX512-FCP-NEXT:    vmovdqa 672(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512-FCP-NEXT:    vpbroadcastw 700(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm31
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2]
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpor %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 672(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512-FCP-NEXT:    vpbroadcastw 700(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm21
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm21[0,1,0,2]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm2
-; AVX512-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 240(%rdi), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm19
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm0, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 240(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm15, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm0[2],ymm11[3,4,5],ymm0[6],ymm11[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm20
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm6
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm25
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT:    vmovdqa 688(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
-; AVX512-FCP-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm29
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm23, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpor %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm26
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT:    vpermd %ymm31, %ymm18, %ymm12
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm14
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5],xmm9[6],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm23
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm23[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vmovdqa 688(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0],xmm15[1],xmm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm12, %ymm10
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm23, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm10, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm4
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm18, %ymm7
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm1
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
+; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm19, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm1, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm10, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm31[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm9, %ymm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm24
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm26
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm29
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,1,1,2]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm27
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm22
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2],ymm5[3,4,5],ymm0[6],ymm5[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpermd %zmm22, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm6
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [1,0,0,0,5,8,12,15]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm14
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3,4,5],xmm14[6],xmm5[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm19, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm19
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm23[0,1,1,2]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 704(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 736(%rdi), %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm20
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm21, %zmm8
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm8, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm3, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm9, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3,4,5],xmm9[6],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm29[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm9, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm14, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2],xmm1[3],xmm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm3, %zmm12
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm10
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm10, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm19[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm10, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm5
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm20, %xmm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm3, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm21
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm24, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm4, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm4
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm22, %xmm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm22
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpbroadcastw 680(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm9
-; AVX512-FCP-NEXT:    vpsrlq $48, %xmm18, %xmm2
+; AVX512-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm2
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm26
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
-; AVX512-FCP-NEXT:    vpermd %ymm31, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm1, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm15
-; AVX512-FCP-NEXT:    vpsrld $16, %xmm19, %xmm8
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm6, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 736(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm30
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14]
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm6, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm13, %ymm7
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm24, %zmm13
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm11
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
+; AVX512-FCP-NEXT:    vpermd %ymm29, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm30
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm15, %zmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,6,10,13,3,6,10,13]
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm9, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 {%k1} # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4],xmm5[5],xmm10[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,4,7,11,14]
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm16, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm15, %zmm14
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm14, %ymm14
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [4,5,6,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
+; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm9, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm14, %ymm11
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 {%k1} # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3],xmm11[4],xmm7[5],xmm11[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm16, %zmm11
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3,4,5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm15, %zmm7
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6],xmm11[7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,0,0,0,4,8,11,15]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm16, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6],xmm14[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm14
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm14
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm21
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm11
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm16, %zmm13
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm24, %zmm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpsrld $16, %xmm14, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm31, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm17, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm0
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm11
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm16
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm10
-; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm11, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm17, %zmm13
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm11, %zmm6
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm17, %zmm3
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm15, %ymm13
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm3, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm24, %zmm6
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm10
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpermd %zmm21, %zmm17, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
+; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm15, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpsrld $16, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm7
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm30, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm27
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm10, %zmm11
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpor %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 864(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm11, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 864(%rdi), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa 832(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm23
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm30
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpor %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm24, %zmm6
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm1))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm22
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermd %zmm27, %zmm6, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm20 & (zmm22 ^ zmm1))
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [2,0,0,0,6,9,13,0]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm20, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm3, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm10, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm7
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm1))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm20
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm1))
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3,4,5],xmm1[6],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpor %ymm6, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm26
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm15, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm19
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm27
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm24
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm21, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm27, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm30
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm9
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm13
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm9
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm18, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm6, %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm22
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm21, %zmm1
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [3,0,0,0,6,10,13,0]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm27, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpor %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm10
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm9, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm27
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm18, %zmm8
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm12
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm27, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm21
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm24, %zmm3, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm8
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3]
-; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm9, %zmm15
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [0,1,10,3,14,7,10,3]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm27, %zmm15
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0],ymm5[1,2],ymm15[3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm13, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm4
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermd %zmm25, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm10
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm11
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT:    vpermd %zmm29, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm5 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm10 = mem ^ (zmm5 & (zmm10 ^ mem))
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm12 = mem ^ (zmm5 & (zmm12 ^ mem))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm5 = zmm5 ^ (zmm11 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm14 = zmm14 ^ (zmm11 & (zmm14 ^ mem))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm11 & (zmm15 ^ zmm10))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm11 & (zmm16 ^ zmm12))
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm6
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,3,1,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm27, %zmm11
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm13 = mem ^ (zmm9 & (zmm13 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm17 = zmm17 ^ (zmm9 & (zmm17 ^ mem))
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm21 = zmm21 ^ (zmm9 & (zmm21 ^ mem))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm12))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm13))
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm19 = zmm19 ^ (zmm29 & (zmm19 ^ mem))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm13 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm13 = zmm13 ^ (zmm20 & (zmm13 ^ mem))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm9
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm19 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm13 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm19 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm19 = zmm19 ^ (zmm20 & (zmm19 ^ mem))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm19 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm18 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm18 = zmm18 ^ (zmm20 & (zmm18 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm6, %zmm18 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm6 = zmm6 ^ (zmm29 & (zmm6 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm6 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm13, %zmm9 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm15, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm18, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 64(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm6 = zmm6 ^ (zmm20 & (zmm6 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = mem ^ (zmm20 & (zmm1 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm9, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm16, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = mem ^ (zmm20 & (zmm1 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm1 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm4 = mem ^ (zmm29 & (zmm4 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm2 = mem ^ (zmm20 & (zmm2 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
-; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = mem ^ (zmm29 & (zmm0 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm0 = mem ^ (zmm20 & (zmm0 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT:    addq $1800, %rsp # imm = 0x708
+; AVX512-FCP-NEXT:    addq $2120, %rsp # imm = 0x848
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -15629,128 +15683,130 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    subq $1592, %rsp # imm = 0x638
 ; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm3, %ymm2, %ymm26
+; AVX512DQ-NEXT:    vporq %ymm3, %ymm2, %ymm29
 ; AVX512DQ-NEXT:    vpbroadcastw 700(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 672(%rdi), %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3]
+; AVX512DQ-NEXT:    vmovdqa 672(%rdi), %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm7
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm13
-; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm17[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastw 252(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm15
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3]
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm4
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm16
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm21
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm21
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm19
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm16
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm5, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm10
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 528(%rdi), %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm7, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm26
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm6, %ymm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT:    vmovdqa 576(%rdi), %ymm14
+; AVX512DQ-NEXT:    vmovdqa 608(%rdi), %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm2[2],ymm14[3,4,5],ymm2[6],ymm14[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %ymm22
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm20 = ymm22[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %ymm25
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm25[0,1,0,2]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm22[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
 ; AVX512DQ-NEXT:    vmovdqa 688(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm14[1],xmm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm17
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm0[0],xmm11[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm14
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512DQ-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm15
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -15759,65 +15815,64 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm8, %ymm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm19
-; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm17[0,1,1,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm21
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm24
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm9, %zmm8, %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm13
-; AVX512DQ-NEXT:    vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-NEXT:    vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm10
+; AVX512DQ-NEXT:    vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
 ; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
 ; AVX512DQ-NEXT:    vpor %ymm7, %ymm5, %ymm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm16
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm25[0,1,1,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm19
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm8
@@ -15825,49 +15880,52 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
 ; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
 ; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm7, %ymm5
 ; AVX512DQ-NEXT:    vpor %ymm5, %ymm6, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm18[0,1,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
 ; AVX512DQ-NEXT:    vpbroadcastw 232(%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-NEXT:    vpsrlq $48, %xmm21, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm2
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm28, %xmm7
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm22[0,1,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm25[0,1,1,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm29[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm26[0,1,2,1,4,5,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-NEXT:    vpbroadcastw 680(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpsrlq $48, %xmm25, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm24, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm12
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
@@ -15880,14 +15938,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpsrld $16, %xmm24, %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm13
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512DQ-NEXT:    vpsrld $16, %xmm21, %xmm7
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm13
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
@@ -15899,12 +15956,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm9
-; AVX512DQ-NEXT:    vpsrld $16, %xmm17, %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm2
+; AVX512DQ-NEXT:    vpsrld $16, %xmm19, %xmm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7]
@@ -15920,23 +15977,23 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm6, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm6
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm20[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm22[0,1,2,1,4,5,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
 ; AVX512DQ-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm18
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm25
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 {%k1} # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm27
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm6
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
 ; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
@@ -15946,430 +16003,428 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm7
+; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
 ; AVX512DQ-NEXT:    vmovdqa 800(%rdi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqa 768(%rdi), %ymm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm16
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 832(%rdi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 864(%rdi), %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa 832(%rdi), %ymm15
+; AVX512DQ-NEXT:    vmovdqa 864(%rdi), %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm3
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm20
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm24
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm25
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm28[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT:    vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,2,1,4,5,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm13
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm14
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm7
 ; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7,8,9,10,11,12,13],ymm12[14],ymm10[15]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6],ymm0[7,8,9,10,11,12,13],ymm12[14],ymm0[15]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm19
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm12, %ymm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm4
-; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm12, %xmm28
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm28[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6],xmm11[7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4,5,6],xmm0[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm29
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6],xmm10[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3,4,5,6],xmm10[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm10, %xmm22
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm22[2,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6],xmm10[7]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3,4,5,6],xmm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6,7,8],ymm0[9],ymm10[10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4,5,6,7,8],ymm0[9],ymm4[10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm11[2],ymm2[3,4,5],ymm11[6],ymm2[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6],xmm4[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm0 & ~ymm19)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
-; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm0, %xmm28
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm0 & ~ymm22)
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm28[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ mem))
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ mem))
 ; AVX512DQ-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm15
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ~ymm19)
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm22)
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm23
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = zmm11 ^ (zmm17 & (zmm11 ^ mem))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm1 = zmm1 ^ (zmm18 & (zmm1 ^ mem))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm16[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm11 = mem[0,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm27
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm19
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm26
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ mem))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = mem[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm24
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm17
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm14[2],ymm7[3,4,5],ymm14[6],ymm7[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm3
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm12, %ymm12
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm12, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm24 = zmm24 ^ (zmm18 & (zmm24 ^ mem))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm11 = mem[0,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm29
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm24
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm25
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm23
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
-; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm20
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm11, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,5,4]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm29 = zmm29 ^ (zmm17 & (zmm29 ^ mem))
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm26[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = mem[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm29
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm22
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm23
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5],xmm11[6],xmm4[7]
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm29 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm29 = zmm29 ^ (zmm18 & (zmm29 ^ mem))
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm12
+; AVX512DQ-NEXT:    vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7,8,9,10,11,12,13],ymm0[14],ymm12[15]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm13
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-NEXT:    vmovdqa %ymm1, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm28
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm27
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm11
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm21[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm31
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7,8,9,10],ymm4[11],ymm1[12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm12
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3,4,5],xmm12[6],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm12, %ymm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vpor %ymm12, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm28
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm14
-; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm7 = mem ^ (zmm17 & (zmm7 ^ mem))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7,8,9,10,11,12,13],ymm0[14],ymm10[15]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm14 & (zmm12 ^ zmm0))
-; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm9
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm14
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm23[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm9 = mem ^ (zmm18 & (zmm9 ^ mem))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm15[2,3],ymm8[4,5],ymm15[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm8[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7,8,9,10],ymm4[11],ymm1[12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0],ymm14[1],ymm6[2,3],ymm14[4],ymm6[5,6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm17 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm13
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm21
-; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm26
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm26
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm22
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm5
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm18
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm10
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm12
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm12 = mem ^ (zmm24 & (zmm12 ^ mem))
+; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm12 = mem ^ (zmm18 & (zmm12 ^ mem))
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm30
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm14
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm15
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm31
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm28
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
@@ -16377,15 +16432,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1))
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm1))
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
@@ -16394,19 +16449,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
 ; AVX512DQ-NEXT:    vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,1,3,1]
@@ -16418,16 +16473,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm11 = mem ^ (zmm24 & (zmm11 ^ mem))
+; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm11 = mem ^ (zmm18 & (zmm11 ^ mem))
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1}
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm5
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
@@ -16435,22 +16490,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm24 & (zmm6 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm18 & (zmm6 ^ zmm1))
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
+; AVX512DQ-NEXT:    vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1],ymm5[2],mem[3,4,5],ymm5[6],mem[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7]
@@ -16466,14 +16521,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    # zmm4 = mem ^ (zmm0 & (zmm4 ^ mem))
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm28 = zmm28 ^ (zmm0 & (zmm28 ^ mem))
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ mem))
 ; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    # zmm23 = zmm23 ^ (zmm0 & (zmm23 ^ mem))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm4))
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rdx)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16481,14 +16536,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm0, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 64(%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%r8)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%r8)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm0, (%r9)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm0, 64(%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 64(%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rax)
@@ -16498,708 +16553,745 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1240, %rsp # imm = 0x4D8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    subq $1960, %rsp # imm = 0x7A8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 672(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm19
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 700(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 672(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm18
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 700(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm24[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 252(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm16
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0],xmm5[1],xmm13[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm23
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm8, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm13[2],ymm4[3,4,5],ymm13[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm12[0],xmm9[1],xmm12[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm25
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 688(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm17[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 688(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0],xmm3[1],xmm8[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm15, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm4, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm25
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm16 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm16, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm15
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm28
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm24[0,1,1,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,0,0,0,5,8,12,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm21
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm16, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm30
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm17[0,1,1,2]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 736(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm10[4],xmm2[5],xmm10[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm16, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm28, %zmm16, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm19, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm24[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm24[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm23
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3],xmm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm16, %zmm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm13, %ymm8
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm26, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm22[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm29
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0],ymm3[1],ymm11[2,3,4],ymm3[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm17[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm21
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm27, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm26, %zmm1, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastw 232(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm27
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastw 680(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm30, %xmm2
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm18, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm26
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7],ymm6[8,9,10,11,12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm11
-; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm19, %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 736(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,4,7,11,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3,4,5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm27, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 {%k1} # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm1, %zmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpsrld $16, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0]
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm8, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [0,3,3,3,0,3,7,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm21, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm17
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3,4,5,6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm20, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm16
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm27, %zmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = zmm6 ^ (zmm25 & (zmm6 ^ mem))
-; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm19
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm19, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 {%k1} # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,4,7,11,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm16, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4,5,6],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm21, %zmm12
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,5,6,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 {%k1} # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3],xmm12[4],xmm7[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm16, %zmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm16
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm12, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3,4,5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm14
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermd %zmm14, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm19, %zmm21, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm30, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm21, %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm9, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm9, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,11,2,11,12,5,8,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm23 = zmm23 ^ (zmm25 & (zmm23 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 864(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 832(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm31
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm9, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm22
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 832(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2],ymm4[3,4,5],ymm1[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm17
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3,4,5],xmm2[6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm29 = zmm29 ^ (zmm25 & (zmm29 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm16, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm25 & (zmm27 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm15, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm20
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3,4,5],xmm1[6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3,4,5],xmm12[6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %zmm27, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm25 & (zmm28 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm19
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm11
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm21, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm29
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm25 & (zmm26 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm16
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm22
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm27
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm27, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm10
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm15, %ymm19
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm17, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm18, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm12 = mem ^ (zmm25 & (zmm12 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm19
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = mem ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm23, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm9 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm29, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem ^ (zmm25 & (zmm0 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm9 = mem ^ (zmm25 & (zmm9 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm27, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm16, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm12 = mem ^ (zmm25 & (zmm12 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm16, %zmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm7, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem ^ (zmm25 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm25 & (zmm4 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem ^ (zmm2 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem ^ (zmm2 & (zmm9 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm3 & (zmm8 ^ zmm9))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm3 = mem ^ (zmm2 & (zmm3 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem ^ (zmm2 & (zmm4 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm5 = zmm5 ^ (zmm2 & (zmm5 ^ mem))
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm6 = zmm6 ^ (zmm2 & (zmm6 ^ mem))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm4))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm25 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm7 = zmm7 ^ (zmm25 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 {%k1} # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%r8)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, (%r9)
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
-; AVX512DQ-FCP-NEXT:    addq $1240, %rsp # imm = 0x4D8
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    addq $1960, %rsp # imm = 0x7A8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index fff21f9aad1bb..2bb2962be73aa 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -496,15 +496,15 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512-NEXT:    vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,11,3,3]
+; AVX512-NEXT:    vpermt2d %ymm4, %ymm9, %ymm5
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT:    vpermt2d %xmm2, %xmm9, %xmm0
+; AVX512-NEXT:    vpermi2d %ymm2, %ymm0, %ymm9
 ; AVX512-NEXT:    vmovq %xmm6, (%rsi)
 ; AVX512-NEXT:    vmovq %xmm7, (%rdx)
 ; AVX512-NEXT:    vmovq %xmm8, (%rcx)
@@ -512,7 +512,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovq %xmm1, (%r9)
 ; AVX512-NEXT:    vmovq %xmm3, (%r11)
 ; AVX512-NEXT:    vmovq %xmm4, (%r10)
-; AVX512-NEXT:    vmovq %xmm0, (%rax)
+; AVX512-NEXT:    vmovq %xmm9, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride8_vf4:
@@ -527,18 +528,18 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm8
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm7, %xmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,9,1,1]
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm8
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm7, %ymm8
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm10, %ymm5
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-FCP-NEXT:    vpermi2d %xmm2, %xmm0, %xmm7
+; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm7
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm2, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm10
 ; AVX512-FCP-NEXT:    vmovq %xmm6, (%rsi)
 ; AVX512-FCP-NEXT:    vmovq %xmm8, (%rdx)
 ; AVX512-FCP-NEXT:    vmovq %xmm9, (%rcx)
@@ -546,7 +547,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vmovq %xmm1, (%r9)
 ; AVX512-FCP-NEXT:    vmovq %xmm7, (%r11)
 ; AVX512-FCP-NEXT:    vmovq %xmm3, (%r10)
-; AVX512-FCP-NEXT:    vmovq %xmm0, (%rax)
+; AVX512-FCP-NEXT:    vmovq %xmm10, (%rax)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride8_vf4:
@@ -564,15 +566,15 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512DQ-NEXT:    vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,11,3,3]
+; AVX512DQ-NEXT:    vpermt2d %ymm4, %ymm9, %ymm5
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT:    vpermt2d %xmm2, %xmm9, %xmm0
+; AVX512DQ-NEXT:    vpermi2d %ymm2, %ymm0, %ymm9
 ; AVX512DQ-NEXT:    vmovq %xmm6, (%rsi)
 ; AVX512DQ-NEXT:    vmovq %xmm7, (%rdx)
 ; AVX512DQ-NEXT:    vmovq %xmm8, (%rcx)
@@ -580,7 +582,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vmovq %xmm1, (%r9)
 ; AVX512DQ-NEXT:    vmovq %xmm3, (%r11)
 ; AVX512DQ-NEXT:    vmovq %xmm4, (%r10)
-; AVX512DQ-NEXT:    vmovq %xmm0, (%rax)
+; AVX512DQ-NEXT:    vmovq %xmm9, (%rax)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf4:
@@ -595,18 +598,18 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,9,1,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm7, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm10, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm2, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm2, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm10
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm8, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm9, (%rcx)
@@ -614,7 +617,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm7, (%r11)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%r10)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm10, (%rax)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i16_stride8_vf4:
@@ -1118,254 +1122,256 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i16_stride8_vf8:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,8]
 ; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vpermt2d %xmm5, %xmm0, %xmm6
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm10
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
-; AVX512-NEXT:    vpermt2d %xmm13, %xmm15, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512-NEXT:    vpermt2d %ymm5, %ymm0, %ymm6
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm11
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm15[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,0,0]
+; AVX512-NEXT:    vpermt2d %ymm14, %ymm8, %ymm15
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT:    vpermi2d %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm3[1],xmm11[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512-NEXT:    vpermt2d %xmm3, %xmm15, %xmm4
+; AVX512-NEXT:    vpermi2d %ymm3, %ymm4, %ymm8
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512-NEXT:    vmovdqa %xmm6, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm7, (%rdx)
-; AVX512-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512-NEXT:    vmovdqa %xmm9, (%r8)
 ; AVX512-NEXT:    vmovdqa %xmm0, (%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa %xmm5, (%rax)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa %xmm10, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride8_vf8:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm10, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm13
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vmovdqa %xmm15, %xmm2
-; AVX512-FCP-NEXT:    vpermt2d %xmm14, %xmm9, %xmm2
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm10, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
-; AVX512-FCP-NEXT:    vpermt2d %xmm14, %xmm10, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermi2d %xmm4, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT:    vpermi2d %xmm6, %xmm7, %xmm9
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT:    vpermi2d %xmm4, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm6, %xmm10, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm15
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm12, %ymm14, %ymm15
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm15, %ymm13
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm15
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm12, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm13, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride8_vf8:
 ; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,8]
 ; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT:    vpermt2d %xmm5, %xmm0, %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm12
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
-; AVX512DQ-NEXT:    vpermt2d %xmm13, %xmm15, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512DQ-NEXT:    vpermt2d %ymm5, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm11
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm15[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,11,0,0]
+; AVX512DQ-NEXT:    vpermt2d %ymm14, %ymm8, %ymm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT:    vpermi2d %xmm1, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm3[1],xmm11[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-NEXT:    vpermt2d %xmm3, %xmm15, %xmm4
+; AVX512DQ-NEXT:    vpermi2d %ymm3, %ymm4, %ymm8
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rcx)
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm9, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rax)
-; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa %xmm10, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf8:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm10, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm14, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm10, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm14, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm4, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm6, %xmm7, %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm4, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm6, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm14, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm15, %ymm13
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i16_stride8_vf8:
@@ -2477,6 +2483,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride8_vf16:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
 ; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2487,164 +2494,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm26
 ; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm27
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
 ; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vpermt2d %xmm5, %xmm7, %xmm6
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm10
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT:    vpermt2d %ymm5, %ymm7, %ymm6
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm11
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm13
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
-; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm23
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
+; AVX512-NEXT:    vmovdqa64 224(%rdi), %ymm20
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm20[0,1,0,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm22
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm22[0,1,0,2]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm17
+; AVX512-NEXT:    vpermq {{.*#+}} ymm23 = ymm17[0,1,0,2]
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm16
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm16[0,1,0,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm23[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512-NEXT:    vpermt2d %xmm4, %xmm17, %xmm14
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,11,0,0]
+; AVX512-NEXT:    vpermt2d %ymm8, %ymm9, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm2
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-NEXT:    vpermi2d %xmm1, %xmm2, %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT:    vpermi2d %ymm1, %ymm2, %ymm7
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm20[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm22[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vpermt2d %xmm4, %xmm17, %xmm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm14[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT:    vpermi2d %ymm3, %ymm4, %ymm9
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm28, (%rsi)
 ; AVX512-NEXT:    vmovdqa64 %ymm29, (%rdx)
 ; AVX512-NEXT:    vmovdqa64 %ymm19, (%rcx)
 ; AVX512-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512-NEXT:    vmovdqa %ymm10, (%r9)
+; AVX512-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa %ymm7, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa %ymm5, (%rax)
+; AVX512-NEXT:    vmovdqa %ymm3, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i16_stride8_vf16:
 ; AVX512-FCP:       # %bb.0:
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
 ; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2652,12 +2659,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm27
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vmovdqa %xmm12, %xmm13
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm7, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm12
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm7, %ymm12
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm9
@@ -2667,143 +2673,143 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm30
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
 ; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm16[0,1,0,2]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
-; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm19
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2]
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vmovdqa %xmm11, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm5, %xmm15, %xmm1
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa %xmm12, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm14, %ymm0
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
-; AVX512-FCP-NEXT:    vpermt2d %xmm5, %xmm18, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm13, %ymm11
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpermi2d %xmm3, %xmm4, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm18[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %xmm6, %xmm5, %xmm15
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %xmm3, %xmm4, %xmm12
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm5, %ymm6, %ymm12
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm14
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %xmm6, %xmm18, %xmm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm5, %ymm6, %ymm13
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rax)
@@ -2816,6 +2822,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-LABEL: load_i16_stride8_vf16:
 ; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
 ; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2826,164 +2833,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm26
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm27
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT:    vpermt2d %xmm5, %xmm7, %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm10
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT:    vpermt2d %ymm5, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm13
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-NEXT:    vmovdqa64 160(%rdi), %ymm23
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqa64 224(%rdi), %ymm20
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm20[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm22
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm22[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 160(%rdi), %ymm17
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm17[0,1,0,2]
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm16
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm16[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm23[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512DQ-NEXT:    vpermt2d %xmm4, %xmm17, %xmm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [3,11,0,0]
+; AVX512DQ-NEXT:    vpermt2d %ymm8, %ymm9, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm2
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-NEXT:    vpermi2d %xmm1, %xmm2, %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm2, %ymm7
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm20[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm22[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %xmm4, %xmm17, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm14[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpermi2d %ymm3, %ymm4, %ymm9
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm28, (%rsi)
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm29, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm19, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm8, (%r8)
-; AVX512DQ-NEXT:    vmovdqa %ymm10, (%r9)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rax)
+; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf16:
 ; AVX512DQ-FCP:       # %bb.0:
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2991,12 +2998,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, %xmm13
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm7, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm7, %ymm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm9
@@ -3006,143 +3012,143 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
 ; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm16[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm5, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm14, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm5, %xmm18, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm13, %ymm11
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm3, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm18[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm6, %xmm5, %xmm15
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm3, %xmm4, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm5, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm4, %ymm14
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm6, %xmm18, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm5, %ymm6, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rax)
@@ -5719,80 +5725,80 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-LABEL: load_i16_stride8_vf32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $616, %rsp # imm = 0x268
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
 ; AVX512-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 352(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa %xmm5, %xmm0
-; AVX512-NEXT:    vpermt2d %xmm3, %xmm1, %xmm0
-; AVX512-NEXT:    vmovdqa 304(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 288(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa 272(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm26 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpermt2d %ymm1, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqa 304(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 288(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa 272(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm16
 ; AVX512-NEXT:    vmovdqa 480(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm1[0,1,0,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
-; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
+; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    movb $-64, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT:    vpermt2d %xmm0, %xmm1, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm17 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm23 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpermt2d %ymm0, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
@@ -5803,27 +5809,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm28
-; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[0,1,0,2]
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm27
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm27[0,1,0,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
@@ -5831,20 +5837,21 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm4
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -5853,66 +5860,67 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm15
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
-; AVX512-NEXT:    vpermt2d %xmm11, %xmm6, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512-NEXT:    vpermt2d %ymm12, %ymm0, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpermt2d %xmm8, %xmm6, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vpermt2d %ymm8, %ymm30, %ymm3
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -5925,167 +5933,166 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm30
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm2
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-NEXT:    vpermt2d %xmm0, %xmm1, %xmm2
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm31
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm21 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
+; AVX512-NEXT:    vpermt2d %ymm0, %ymm3, %ymm1
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm12
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm19 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm21 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm11
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm28 = mem[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm23 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm20 = mem[0,1,1,3]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm19 = mem[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1}
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512-NEXT:    vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vmovdqa %ymm3, %ymm0
+; AVX512-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm10 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm17 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm24 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm25 = mem[0,1,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm26 = mem[0,1,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm27
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm22
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm9
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm23
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-NEXT:    vmovdqa %ymm11, %ymm12
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
-; AVX512-NEXT:    vpermt2d %xmm16, %xmm12, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
-; AVX512-NEXT:    vpermt2d %xmm8, %xmm12, %xmm5
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT:    vpermt2d %ymm12, %ymm30, %ymm6
+; AVX512-NEXT:    vpermi2d %ymm9, %ymm10, %ymm30
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm16[2,2,2,2]
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -6094,9 +6101,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovaps %zmm2, (%rcx)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm2, (%r8)
-; AVX512-NEXT:    vmovdqa64 %zmm27, (%r9)
+; AVX512-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm23, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -6107,42 +6114,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride8_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
 ; AVX512-FCP-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512-FCP-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa 304(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vmovdqa 272(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
@@ -6155,28 +6162,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    movb $-64, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm13, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm3, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -6186,384 +6193,384 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm23
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm25
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm25[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm20, %xmm11, %xmm0
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm19, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm11, %xmm1
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm16, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm20[0],xmm14[1],xmm20[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm22, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm13, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm25, %xmm0, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
-; AVX512-FCP-NEXT:    vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm22 = xmm23[2],xmm2[2],xmm23[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm9
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm19[2],xmm11[3],xmm19[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm20[2],xmm14[3],xmm20[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm20, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm18[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm31
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm19, %ymm2, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm27, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vpermt2d %ymm16, %ymm30, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm29
+; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm23 = mem[0,1,1,3]
+; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm22 = mem[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm22[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm21 = mem[0,1,1,3]
+; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm20 = mem[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm29 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm12, %xmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vpermt2d %xmm3, %xmm0, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm31
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm26 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm28 = mem[0,1,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm26[0,1,1,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vpermt2d %ymm17, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm18[0],xmm8[1],xmm18[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm30 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm25 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm16
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm20 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm19 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm21 = mem[0,1,1,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm31, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vpermt2d %xmm18, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, %xmm3
-; AVX512-FCP-NEXT:    vpermi2d %xmm16, %xmm7, %xmm3
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm12, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm2, %xmm0, %xmm8
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vpermi2d %xmm22, %xmm17, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm18, %xmm27, %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm3
+; AVX512-FCP-NEXT:    vpermi2d %ymm10, %ymm11, %ymm3
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm16[0],xmm19[0],xmm16[1],xmm19[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm18
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm13
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm8
+; AVX512-FCP-NEXT:    vpermi2d %ymm19, %ymm16, %ymm8
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm17, %ymm30, %ymm6
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %ymm10, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm26[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm19[2],xmm16[3],xmm19[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vpermt2d %xmm16, %xmm27, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride8_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    subq $616, %rsp # imm = 0x268
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
 ; AVX512DQ-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm5, %xmm0
-; AVX512DQ-NEXT:    vpermt2d %xmm3, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vmovdqa 304(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 272(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm26 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpermt2d %ymm1, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqa 304(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa 272(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm16
 ; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-NEXT:    movb $-64, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT:    vpermt2d %xmm0, %xmm1, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm17 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm23 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT:    vpermt2d %ymm0, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
@@ -6574,27 +6581,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
 ; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm28
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm27
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm27[0,1,0,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
@@ -6602,20 +6609,21 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm4
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -6624,66 +6632,67 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm15
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm15
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
-; AVX512DQ-NEXT:    vpermt2d %xmm11, %xmm6, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512DQ-NEXT:    vpermt2d %ymm12, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpermt2d %xmm8, %xmm6, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpermt2d %ymm8, %ymm30, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -6696,167 +6705,166 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm30
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm2
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-NEXT:    vpermt2d %xmm0, %xmm1, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm31
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm21 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
+; AVX512DQ-NEXT:    vpermt2d %ymm0, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm12
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm19 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm21 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm11
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm28 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm23 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm20 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm19 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512DQ-NEXT:    vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm10 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm17 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm24 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm25 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm26 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm27
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm22
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm9
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm23
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm12
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm19[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
-; AVX512DQ-NEXT:    vpermt2d %xmm16, %xmm12, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vpermt2d %xmm8, %xmm12, %xmm5
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpermt2d %ymm12, %ymm30, %ymm6
+; AVX512DQ-NEXT:    vpermi2d %ymm9, %ymm10, %ymm30
+; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm16[2,2,2,2]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -6865,9 +6873,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%rcx)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -6878,42 +6886,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 304(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 272(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
@@ -6926,28 +6934,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm3, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -6957,304 +6965,304 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm25[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm20, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm19, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm16, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm20[0],xmm14[1],xmm20[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm22, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm25, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm22 = xmm23[2],xmm2[2],xmm23[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm19[2],xmm11[3],xmm19[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm20[2],xmm14[3],xmm20[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm20, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm18[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm31
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm19, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm27, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm16, %ymm30, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm23 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm22 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm22[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm21 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm20 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm29 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm3, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm31
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm26 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm28 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm26[0,1,1,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm17, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm18[0],xmm8[1],xmm18[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm30 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm25 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm16
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm20 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm19 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm21 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm31, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm18, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm16, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm2, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm22, %xmm17, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm18, %xmm27, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm10, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm16[0],xmm19[0],xmm16[1],xmm19[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm18
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm19, %ymm16, %ymm8
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm17, %ymm30, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm10, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm26[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm19[2],xmm16[3],xmm19[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm16, %xmm27, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-FCP-NEXT:    addq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -12647,39 +12655,40 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-LABEL: load_i16_stride8_vf64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $2408, %rsp # imm = 0x968
-; AVX512-NEXT:    vmovdqa 368(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 352(%rdi), %xmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
+; AVX512-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa 336(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa 336(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm0, %xmm6, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512-NEXT:    vpermt2d %ymm3, %ymm1, %ymm4
 ; AVX512-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 288(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 288(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 272(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovdqa 256(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa 480(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12702,12 +12711,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -12732,7 +12741,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm1, %xmm6, %xmm2
+; AVX512-NEXT:    vpermt2d %ymm1, %ymm5, %ymm2
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm6
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm3
@@ -12742,10 +12752,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm5
 ; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12792,11 +12802,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm0, %xmm6, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm6, %xmm10
+; AVX512-NEXT:    vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512-NEXT:    vmovdqa %ymm6, %ymm10
 ; AVX512-NEXT:    vmovdqa 816(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 800(%rdi), %xmm2
@@ -12808,8 +12818,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm19
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa 992(%rdi), %ymm1
@@ -12851,11 +12861,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
-; AVX512-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512-NEXT:    vpermt2d %ymm1, %ymm10, %ymm2
 ; AVX512-NEXT:    vmovdqa 560(%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa 544(%rdi), %xmm5
@@ -12870,12 +12880,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm1[0,1,0,2]
 ; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm30[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
@@ -12885,10 +12895,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
 ; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm5
 ; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm5[0,1,0,2]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm29[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
@@ -12896,7 +12906,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
@@ -12924,7 +12934,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm9 = xmm9[0,1],mem[2,3]
@@ -12932,9 +12942,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm9
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm9
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
-; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -12946,8 +12956,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512-NEXT:    vmovdqa %xmm8, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3]
 ; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -12961,7 +12971,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm16
+; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm16
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13000,14 +13010,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
-; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm22
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm22
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm24
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm9[2],xmm19[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm24
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
@@ -13027,10 +13037,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm18
-; AVX512-NEXT:    vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm3 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm30[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
@@ -13038,17 +13048,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512-NEXT:    vpermt2d %xmm16, %xmm0, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm16
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512-NEXT:    vpermt2d %ymm16, %ymm0, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm16
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm1[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13064,8 +13074,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT:    vpermt2d %xmm22, %xmm16, %xmm1
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512-NEXT:    vpermt2d %ymm22, %ymm16, %ymm1
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
 ; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm3
@@ -13079,8 +13089,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512-NEXT:    vpermt2d %xmm24, %xmm16, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-NEXT:    vpermt2d %ymm24, %ymm16, %ymm0
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13092,7 +13102,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpermt2d %xmm18, %xmm16, %xmm15
+; AVX512-NEXT:    vpermt2d %ymm18, %ymm16, %ymm15
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm1 = xmm15[0,1],mem[2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -13109,27 +13119,26 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
-; AVX512-NEXT:    vpermt2d %xmm5, %xmm10, %xmm6
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [0,0,0,8]
+; AVX512-NEXT:    vpermt2d %ymm5, %ymm6, %ymm0
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
@@ -13172,16 +13181,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512-NEXT:    vpermt2d %ymm1, %ymm6, %ymm2
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm2 = mem[0,1,1,3]
@@ -13192,22 +13201,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm30 = mem[0,1,1,3]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm4 = mem[0,1,1,3]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = mem[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -13216,307 +13228,307 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpermt2d %xmm0, %xmm10, %xmm1
+; AVX512-NEXT:    vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[0,1,1,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm22 = mem[0,1,1,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm5
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm28 = mem[0,1,1,3]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm27 = mem[0,1,1,3]
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm25 = mem[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5 {%k1}
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT:    vmovdqa %xmm10, %xmm1
-; AVX512-NEXT:    vpermi2d %xmm2, %xmm8, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm19
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm31
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm21
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = mem[0,1,1,3]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm13
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
-; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm9 = xmm9[0,1],mem[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm0
+; AVX512-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm5
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm22[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm17
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm22
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm4 = xmm4[0,1],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm31[1,1,1,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
+; AVX512-NEXT:    vmovdqa %ymm7, %ymm12
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm7
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm24[0],xmm7[1],xmm24[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX512-NEXT:    vmovdqa %ymm6, %ymm14
+; AVX512-NEXT:    vmovdqa %ymm11, %ymm6
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm30
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm13 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm21
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm13 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm23
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm11
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm22[2],xmm11[3],xmm22[3]
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm19
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm30
+; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm17
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm18
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm18[2],xmm16[3],xmm18[3]
 ; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
-; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm22
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm4
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm29
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm15
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm15 {%k1}
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,11,0,0]
+; AVX512-NEXT:    vpermt2d %ymm14, %ymm2, %ymm6
+; AVX512-NEXT:    vpermt2d %ymm22, %ymm2, %ymm11
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm25
+; AVX512-NEXT:    vpermt2d %ymm18, %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm22
+; AVX512-NEXT:    vpermi2d %ymm12, %ymm31, %ymm2
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm12[2],xmm31[3],xmm12[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm24[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm16
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm4[2,3]
+; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm31
-; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
-; AVX512-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = xmm6[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm7 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm6
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm6
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm3
-; AVX512-NEXT:    vpermt2d %xmm23, %xmm16, %xmm3
-; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm3 = xmm3[0,1],mem[2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm6
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm3
-; AVX512-NEXT:    vpermt2d %xmm30, %xmm16, %xmm3
-; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm3 = xmm3[0,1],mem[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT:    vpermt2d %xmm29, %xmm16, %xmm4
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm7
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm6
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm7 {%k1}
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm16[2],xmm24[2],xmm16[3],xmm24[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm3, 64(%rsi)
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -13544,8 +13556,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovaps %zmm3, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
@@ -13555,21 +13566,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i16_stride8_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $2312, %rsp # imm = 0x908
+; AVX512-FCP-NEXT:    subq $2440, %rsp # imm = 0x988
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
 ; AVX512-FCP-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vmovdqa %xmm14, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm2, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa 304(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
@@ -13581,9 +13593,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm30
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
@@ -13607,12 +13619,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -13630,24 +13642,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT:    vpermt2d %xmm3, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm9
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13659,22 +13672,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13684,211 +13697,216 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 864(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa 848(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 832(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm2
-; AVX512-FCP-NEXT:    vpermt2d %xmm3, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512-FCP-NEXT:    vmovdqa 816(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 800(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa 800(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vmovdqa 784(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 768(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa 768(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa 992(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 960(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-FCP-NEXT:    vmovdqa 928(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 896(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5],ymm10[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa 624(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 608(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vmovdqa 592(%rdi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512-FCP-NEXT:    vpermt2d %xmm11, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm11, %xmm16
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512-FCP-NEXT:    vpermt2d %ymm10, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa 560(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa 544(%rdi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vmovdqa 528(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm11, %xmm21
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm22
+; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa 736(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 704(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-FCP-NEXT:    vmovdqa 672(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm30
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512-FCP-NEXT:    vpermt2d %xmm25, %xmm13, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm8
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm19
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT:    vpermt2d %xmm21, %xmm13, %xmm3
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm4
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm19
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm9
+; AVX512-FCP-NEXT:    vpermt2d %ymm29, %ymm10, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm15
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm24[0],xmm11[1],xmm24[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm0, %zmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm10
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpermt2d %ymm18, %ymm15, %ymm10
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm25[0],xmm27[0],xmm25[1],xmm27[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm21
+; AVX512-FCP-NEXT:    vpermt2d %ymm23, %ymm15, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm8
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm28[0],xmm8[0],xmm28[1],xmm8[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %ymm16, %ymm15, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm22
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm31[0],xmm17[0],xmm31[1],xmm17[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %xmm14, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm24[2],xmm11[3],xmm24[3]
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm24, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm28
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm24, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm16[2],xmm29[2],xmm16[3],xmm29[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -13896,25 +13914,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm2, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm25
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm25[2],xmm2[2],xmm25[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm18[2],xmm13[3],xmm18[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm23
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd $231, (%rsp), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -13923,20 +13941,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm8, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm11, %xmm20
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm17 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm8, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm21[2],xmm9[3],xmm21[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm19
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm24
-; AVX512-FCP-NEXT:    vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
@@ -13947,10 +13966,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm16, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm20
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm20[2],xmm22[3],xmm20[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,1,3,4,5,5,7]
@@ -13970,55 +13991,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm28, %xmm16, %xmm1
-; AVX512-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm1[0,1],mem[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm29, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm3
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm15
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm23, %xmm16, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm25, %ymm16, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm3
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm20, %xmm16, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm21, %ymm16, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm21, %xmm16, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm20, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
@@ -14033,21 +14055,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,8]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
@@ -14070,7 +14093,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14085,18 +14108,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm27
-; AVX512-FCP-NEXT:    vpermt2d %xmm6, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm18
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT:    vpermt2d %ymm6, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm31
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,1,3]
@@ -14104,12 +14128,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
@@ -14133,29 +14156,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm31
-; AVX512-FCP-NEXT:    vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm21
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
@@ -14171,7 +14194,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -14179,20 +14202,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpermi2d %xmm2, %xmm3, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm16
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm28
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm30
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm25
+; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14201,8 +14225,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
@@ -14213,41 +14238,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512-FCP-NEXT:    vpermt2d %xmm19, %xmm15, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,9,0,0]
+; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm24
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm18[0],xmm9[1],xmm18[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshuflw $212, (%rsp), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm24, %xmm15, %xmm1
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm27, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm15, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm22[0],xmm15[0],xmm22[1],xmm15[1]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm13
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
@@ -14255,13 +14281,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm25, %xmm15, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512-FCP-NEXT:    vpermt2d %ymm30, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm13
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm21[0],xmm13[0],xmm21[1],xmm13[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -14270,12 +14297,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm15, %xmm6
-; AVX512-FCP-NEXT:    vpermi2d %xmm28, %xmm30, %xmm6
-; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm6
+; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm25, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm20
+; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm28[0],xmm29[0],xmm28[1],xmm29[1]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -14285,27 +14313,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512-FCP-NEXT:    vpermt2d %xmm29, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm28 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,10]
+; AVX512-FCP-NEXT:    vpermt2d %ymm18, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -14313,135 +14342,141 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm18, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm22[2],xmm2[2],xmm22[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm16
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermt2d %xmm13, %xmm5, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm22
-; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm18 = xmm21[2],xmm1[2],xmm21[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm13, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm30[2],xmm21[3],xmm30[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm21
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm25
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm27
-; AVX512-FCP-NEXT:    vpermi2d %xmm7, %xmm17, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm5 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm7 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm17
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [3,11,3,3]
+; AVX512-FCP-NEXT:    vpermt2d %ymm6, %ymm4, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
+; AVX512-FCP-NEXT:    vpermt2d %ymm16, %ymm4, %ymm9
+; AVX512-FCP-NEXT:    vpermt2d %ymm30, %ymm4, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpermi2d %ymm20, %ymm1, %ymm4
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpermi2d %ymm20, %ymm21, %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm1 = xmm0[0,1],mem[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT:    vpermt2d %xmm18, %xmm17, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
-; AVX512-FCP-NEXT:    vpermt2d %xmm22, %xmm17, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm4
-; AVX512-FCP-NEXT:    vpermt2d %xmm28, %xmm17, %xmm4
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, 64(%rsi)
@@ -14469,52 +14504,52 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT:    addq $2312, %rsp # imm = 0x908
+; AVX512-FCP-NEXT:    addq $2440, %rsp # imm = 0x988
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i16_stride8_vf64:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    subq $2408, %rsp # imm = 0x968
-; AVX512DQ-NEXT:    vmovdqa 368(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
+; AVX512DQ-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa 336(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa 336(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm0, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512DQ-NEXT:    vpermt2d %ymm3, %ymm1, %ymm4
 ; AVX512DQ-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 272(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14537,12 +14572,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -14567,7 +14602,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm1, %xmm6, %xmm2
+; AVX512DQ-NEXT:    vpermt2d %ymm1, %ymm5, %ymm2
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm6
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm3
@@ -14577,10 +14613,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14627,11 +14663,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm0, %xmm6, %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm10
+; AVX512DQ-NEXT:    vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm10
 ; AVX512DQ-NEXT:    vmovdqa 816(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 800(%rdi), %xmm2
@@ -14643,8 +14679,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm19
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa 992(%rdi), %ymm1
@@ -14686,11 +14722,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vpermt2d %ymm1, %ymm10, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa 560(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 544(%rdi), %xmm5
@@ -14705,12 +14741,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm1[0,1,0,2]
 ; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm30[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
@@ -14720,10 +14756,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
 ; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm5
 ; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm5[0,1,0,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm29[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
@@ -14731,7 +14767,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
@@ -14759,7 +14795,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm9 = xmm9[0,1],mem[2,3]
@@ -14767,9 +14803,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm9
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -14781,8 +14817,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -14796,7 +14832,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm16
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14835,14 +14871,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm22
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm24
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm9[2],xmm19[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm24
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
@@ -14862,10 +14898,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm18
-; AVX512DQ-NEXT:    vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm3 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm30[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
@@ -14873,17 +14909,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-NEXT:    vpermt2d %xmm16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm16
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-NEXT:    vpermt2d %ymm16, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm16
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm1[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14899,8 +14935,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT:    vpermt2d %xmm22, %xmm16, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-NEXT:    vpermt2d %ymm22, %ymm16, %ymm1
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm3
@@ -14914,8 +14950,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-NEXT:    vpermt2d %xmm24, %xmm16, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT:    vpermt2d %ymm24, %ymm16, %ymm0
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14927,7 +14963,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpermt2d %xmm18, %xmm16, %xmm15
+; AVX512DQ-NEXT:    vpermt2d %ymm18, %ymm16, %ymm15
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm1 = xmm15[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -14944,27 +14980,26 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
-; AVX512DQ-NEXT:    vpermt2d %xmm5, %xmm10, %xmm6
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [0,0,0,8]
+; AVX512DQ-NEXT:    vpermt2d %ymm5, %ymm6, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
@@ -15007,16 +15042,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vpermt2d %ymm1, %ymm6, %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm2 = mem[0,1,1,3]
@@ -15027,22 +15062,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm30 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -15051,307 +15089,307 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpermt2d %xmm0, %xmm10, %xmm1
+; AVX512DQ-NEXT:    vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm22 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm5
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm28 = mem[0,1,1,3]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm29 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm27 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm25 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm1
-; AVX512DQ-NEXT:    vpermi2d %xmm2, %xmm8, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm19
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm31
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm21
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm13
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm9 = xmm9[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm5
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm22[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm22
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = xmm4[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm31[1,1,1,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
+; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm12
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm7
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm24[0],xmm7[1],xmm24[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm14
+; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm6
 ; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm30
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm13 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm21
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm13 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm23
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm19
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm30
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
-; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm22
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm4
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm29
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm11
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm22[2],xmm11[3],xmm22[3]
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm18
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm18[2],xmm16[3],xmm18[3]
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm15
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm15 {%k1}
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,11,0,0]
+; AVX512DQ-NEXT:    vpermt2d %ymm14, %ymm2, %ymm6
+; AVX512DQ-NEXT:    vpermt2d %ymm22, %ymm2, %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm25
+; AVX512DQ-NEXT:    vpermt2d %ymm18, %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm22
+; AVX512DQ-NEXT:    vpermi2d %ymm12, %ymm31, %ymm2
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm12[2],xmm31[3],xmm12[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm24[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm16
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm1
-; AVX512DQ-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = xmm6[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm7 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm6
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm6
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm6
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm3
-; AVX512DQ-NEXT:    vpermt2d %xmm23, %xmm16, %xmm3
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm3 = xmm3[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm6
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm3
-; AVX512DQ-NEXT:    vpermt2d %xmm30, %xmm16, %xmm3
-; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vpermt2d %xmm29, %xmm16, %xmm4
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm7
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm6
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm16[2],xmm24[2],xmm16[3],xmm24[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm3, 64(%rsi)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -15379,8 +15417,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovaps %zmm3, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
@@ -15390,21 +15427,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $2312, %rsp # imm = 0x908
+; AVX512DQ-FCP-NEXT:    subq $2440, %rsp # imm = 0x988
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 336(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm14, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm2, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 304(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
@@ -15416,9 +15454,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
@@ -15442,12 +15480,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -15465,24 +15503,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm3, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15494,22 +15533,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -15519,211 +15558,216 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 864(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 848(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 832(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm3, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa 816(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 800(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 800(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 784(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 768(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 768(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa 992(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 960(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 928(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 896(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5],ymm10[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa 624(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 608(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 592(%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm11, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm11, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm10, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa 560(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 544(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 528(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm11, %xmm21
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 736(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 704(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 672(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm30
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm25, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm19
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm21, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm9
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm29, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm24[0],xmm11[1],xmm24[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm18, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm25[0],xmm27[0],xmm25[1],xmm27[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm21
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm23, %ymm15, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm8
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm28[0],xmm8[0],xmm28[1],xmm8[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm16, %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm22
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm31[0],xmm17[0],xmm31[1],xmm17[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm14, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm24[2],xmm11[3],xmm24[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm24, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm28
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm24, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm16[2],xmm29[2],xmm16[3],xmm29[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -15731,25 +15775,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm2, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm25[2],xmm2[2],xmm25[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm18[2],xmm13[3],xmm18[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm23
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, (%rsp), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -15758,20 +15802,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm8, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm11, %xmm20
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm17 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm21[2],xmm9[3],xmm21[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT:    vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
@@ -15782,10 +15827,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm16, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm20[2],xmm22[3],xmm20[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,1,3,4,5,5,7]
@@ -15805,55 +15852,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm28, %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm29, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm15
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm23, %xmm16, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm25, %ymm16, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm20, %xmm16, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm21, %ymm16, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm21, %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm20, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
@@ -15868,21 +15916,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
@@ -15905,7 +15954,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15920,18 +15969,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm27
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm6, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm18
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,1,3]
@@ -15939,12 +15989,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3]
@@ -15968,29 +16017,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm20
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,1,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
@@ -16006,7 +16055,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -16014,20 +16063,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm2, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm16
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm28
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -16036,8 +16086,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-FCP-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,1,3]
@@ -16048,41 +16099,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm19, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm15 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm9, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm18[0],xmm9[1],xmm18[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw $212, (%rsp), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm24, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm27, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm15, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm22[0],xmm15[0],xmm22[1],xmm15[1]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
@@ -16090,13 +16142,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm25, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm30, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm21[0],xmm13[0],xmm21[1],xmm13[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -16105,12 +16158,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm28, %xmm30, %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm14, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm25, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm20
+; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm28[0],xmm29[0],xmm28[1],xmm29[1]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -16120,27 +16174,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm29, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm28 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm18, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -16148,135 +16203,141 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm18, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm26 = xmm22[2],xmm2[2],xmm22[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm15, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm16
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm10 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm13, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm18 = xmm21[2],xmm1[2],xmm21[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm13, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm21
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm30[2],xmm21[3],xmm30[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm27
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm7, %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm16, %ymm4, %ymm9
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm30, %ymm4, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm20, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm20, %ymm21, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm1 = xmm0[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm18, %xmm17, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm22, %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm4
-; AVX512DQ-FCP-NEXT:    vpermt2d %xmm28, %xmm17, %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rsi)
@@ -16304,13 +16365,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-FCP-NEXT:    addq $2312, %rsp # imm = 0x908
+; AVX512DQ-FCP-NEXT:    addq $2440, %rsp # imm = 0x988
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 34f23213500c1..c41e306630c0e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -228,20 +228,19 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX-LABEL: load_i32_stride3_vf4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX-NEXT:    vmovaps (%rdi), %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm3
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0],xmm4[1,2],xmm3[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX-NEXT:    vmovaps %xmm3, (%rsi)
+; AVX-NEXT:    vmovaps %xmm2, (%rsi)
 ; AVX-NEXT:    vmovaps %xmm4, (%rdx)
 ; AVX-NEXT:    vmovaps %xmm0, (%rcx)
 ; AVX-NEXT:    retq
@@ -477,8 +476,8 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovaps (%rdi), %ymm2
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm4
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
@@ -818,16 +817,16 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps (%rdi), %ymm6
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm7
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm9
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
@@ -1406,135 +1405,139 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride3_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX-NEXT:    subq $456, %rsp # imm = 0x1C8
 ; AVX-NEXT:    vmovaps 256(%rdi), %ymm2
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm7
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm5
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm10
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm9
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 352(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm9
+; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 288(%rdi), %ymm8
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm11
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm10
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm10, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
-; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm11[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm7[2,0],ymm11[5,4],ymm7[6,4]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7]
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps %ymm2, %ymm4
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm15
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7]
-; AVX-NEXT:    vmovaps (%rdi), %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm7
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT:    vmovaps (%rdi), %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5,6],ymm15[7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm9
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm9[1,0],ymm0[2,0],ymm9[5,4],ymm0[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm13[6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups %ymm11, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm7[3,0],ymm11[6,4],ymm7[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,0],ymm2[2,0],ymm7[4,4],ymm2[6,4]
 ; AVX-NEXT:    vmovaps 112(%rdi), %xmm13
-; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4]
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm8
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,0],ymm2[2,0],ymm14[4,4],ymm2[6,4]
+; AVX-NEXT:    vmovaps 304(%rdi), %xmm11
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm11[0,3],ymm10[5,6],ymm11[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4]
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm10
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[2,0],ymm0[3,0],ymm9[6,4],ymm0[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm10
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7]
-; AVX-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7]
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm12[3,0],ymm4[6,4],ymm12[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm2[2,0],ymm12[4,4],ymm2[6,4]
+; AVX-NEXT:    vmovaps 208(%rdi), %xmm6
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm6[0,3],ymm4[5,6],ymm6[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vblendps $36, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm3[0,3],ymm4[6,4],ymm3[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm7[0,1],mem[0,3],ymm7[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm11[1,0],ymm7[2,0],ymm11[5,4],ymm7[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm5[0,3],ymm7[6,4],ymm5[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
 ; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[0,3],ymm0[4,5],ymm9[4,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7]
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovaps %ymm1, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm1[2,0],ymm6[5,4],ymm1[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[0,3],ymm1[6,4],ymm3[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, (%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX-NEXT:    vmovaps %ymm4, 64(%rdx)
+; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, (%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -1544,8 +1547,8 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
 ; AVX-NEXT:    vmovaps %ymm0, (%rcx)
 ; AVX-NEXT:    vmovaps %ymm5, 96(%rcx)
-; AVX-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX-NEXT:    addq $392, %rsp # imm = 0x188
+; AVX-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX-NEXT:    addq $456, %rsp # imm = 0x1C8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -2575,37 +2578,37 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride3_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $1384, %rsp # imm = 0x568
+; AVX-NEXT:    subq $1512, %rsp # imm = 0x5E8
 ; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX-NEXT:    vmovaps 512(%rdi), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 352(%rdi), %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 320(%rdi), %ymm6
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 288(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 160(%rdi), %ymm8
 ; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm9
-; AVX-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
+; AVX-NEXT:    vmovaps %ymm9, %ymm8
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4]
@@ -2613,8 +2616,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX-NEXT:    vmovaps 496(%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
@@ -2622,102 +2625,104 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 704(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 688(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7]
+; AVX-NEXT:    vmovaps 704(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 672(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 736(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 736(%rdi), %ymm6
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm7
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7]
-; AVX-NEXT:    vmovaps (%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm4
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm11
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm11[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm6
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7]
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 256(%rdi), %ymm5
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 256(%rdi), %ymm7
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm7[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm5[2,0],ymm7[5,4],ymm5[6,4]
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 416(%rdi), %ymm12
-; AVX-NEXT:    vmovaps 400(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7]
-; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 416(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 448(%rdi), %ymm8
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 384(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 448(%rdi), %ymm12
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm12[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm9[2,0],ymm12[5,4],ymm9[6,4]
+; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 608(%rdi), %ymm10
-; AVX-NEXT:    vmovaps 592(%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
-; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 608(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT:    vmovaps 640(%rdi), %ymm13
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 576(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT:    vmovaps 640(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[3,0],ymm0[6,4],ymm8[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm8[0,0],ymm0[2,0],ymm8[4,4],ymm0[6,4]
+; AVX-NEXT:    vmovaps 112(%rdi), %xmm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm4[0,3],ymm1[5,6],ymm4[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4]
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 304(%rdi), %xmm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm4[0,3],ymm1[5,6],ymm4[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2734,124 +2739,132 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4]
-; AVX-NEXT:    vmovaps 688(%rdi), %xmm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm2[3,0],ymm6[6,4],ymm2[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[2,0],ymm2[4,4],ymm0[6,4]
+; AVX-NEXT:    vmovaps 688(%rdi), %xmm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
+; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[2,0],ymm10[3,0],ymm11[6,4],ymm10[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,0],ymm0[2,0],ymm10[4,4],ymm0[6,4]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm6
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm6[0,3],ymm2[5,6],ymm6[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm5[3,0],ymm7[6,4],ymm5[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,0],ymm1[2,0],ymm5[4,4],ymm1[6,4]
+; AVX-NEXT:    vmovaps 208(%rdi), %xmm8
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm8[0,3],ymm3[5,6],ymm8[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[2,0],ymm9[3,0],ymm12[6,4],ymm9[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4]
-; AVX-NEXT:    vmovaps 400(%rdi), %xmm8
+; AVX-NEXT:    vmovaps 400(%rdi), %xmm12
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm12[0,3],ymm15[5,6],ymm12[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps 592(%rdi), %xmm9
+; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm13[3,0],ymm3[6,4],ymm13[7,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm13[0,0],ymm3[2,0],ymm13[4,4],ymm3[6,4]
+; AVX-NEXT:    vmovaps 592(%rdi), %xmm13
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups (%rsp), %ymm15 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm11[1,0],ymm14[2,0],ymm11[5,4],ymm14[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps %ymm0, %ymm15
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[0,3],ymm0[6,4],ymm15[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm10 = ymm10[0,1],mem[0,3],ymm10[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[0,3],ymm0[6,4],ymm11[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7]
+; AVX-NEXT:    vshufps $196, (%rsp), %ymm4, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm8[1,0],ymm0[2,0],ymm8[5,4],ymm0[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[0,3],ymm0[6,4],ymm1[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm8[0,3],ymm1[6,4],ymm8[4,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm8 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vmovaps %ymm2, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[0,3],ymm2[6,4],ymm4[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm6 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,0],ymm6[2,0],ymm4[5,4],ymm6[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[0,3],ymm6[6,4],ymm7[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vmovaps %ymm3, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[0,3],ymm3[6,4],ymm4[4,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7]
-; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7]
+; AVX-NEXT:    vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm4, 192(%rsi)
@@ -2886,14 +2899,15 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm4, 32(%rdx)
 ; AVX-NEXT:    vmovaps %ymm3, 192(%rcx)
-; AVX-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX-NEXT:    vmovaps %ymm6, 224(%rcx)
 ; AVX-NEXT:    vmovaps %ymm2, 128(%rcx)
 ; AVX-NEXT:    vmovaps %ymm1, 160(%rcx)
 ; AVX-NEXT:    vmovaps %ymm0, 64(%rcx)
-; AVX-NEXT:    vmovaps %ymm14, 96(%rcx)
-; AVX-NEXT:    vmovaps %ymm7, (%rcx)
-; AVX-NEXT:    vmovaps %ymm13, 32(%rcx)
-; AVX-NEXT:    addq $1384, %rsp # imm = 0x568
+; AVX-NEXT:    vmovaps %ymm11, 96(%rcx)
+; AVX-NEXT:    vmovaps %ymm10, (%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX-NEXT:    addq $1512, %rsp # imm = 0x5E8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index c08442f9d9d01..571e1435b75f1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -359,25 +359,25 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm3
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm6
-; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,2,3,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[0]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
 ; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3]
 ; AVX-NEXT:    vshufpd {{.*#+}} xmm7 = xmm7[1,0]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm5[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
 ; AVX-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX-NEXT:    vmovaps %xmm5, (%rdx)
+; AVX-NEXT:    vmovaps %xmm6, (%rdx)
 ; AVX-NEXT:    vmovaps %xmm7, (%rcx)
 ; AVX-NEXT:    vmovaps %xmm3, (%r8)
 ; AVX-NEXT:    vmovaps %xmm0, (%r9)
@@ -1011,23 +1011,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1044,23 +1047,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -1077,23 +1083,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512DQ-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1110,23 +1119,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -1143,23 +1155,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512BW-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512BW-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512BW-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512BW-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512BW-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512BW-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512BW-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512BW-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1176,23 +1191,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512BW-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -1209,23 +1227,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512DQ-BW-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-BW-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-BW-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
@@ -1242,23 +1263,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
@@ -1563,9 +1587,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm9
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm9
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm8
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm9[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1605,7 +1629,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
@@ -3082,11 +3106,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
@@ -3098,11 +3122,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 480(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 480(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3130,11 +3154,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 320(%rdi), %xmm3
+; AVX-NEXT:    vmovaps 352(%rdi), %xmm3
 ; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3203,8 +3227,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6]
@@ -3215,8 +3239,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6]
@@ -3239,8 +3263,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6]
@@ -6144,11 +6168,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm4
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm4
 ; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6161,11 +6185,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 480(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 480(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6178,11 +6202,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 800(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 832(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 832(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 800(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6194,11 +6218,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 1120(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 1152(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 1120(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6226,11 +6250,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 352(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6243,11 +6267,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
 ; AVX-NEXT:    vmovaps %ymm12, %ymm7
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 640(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 672(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 640(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6261,11 +6285,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT:    vmovaps 960(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 992(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 960(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6418,8 +6442,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4]
@@ -6433,8 +6457,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
@@ -6448,8 +6472,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
@@ -6463,8 +6487,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4]
@@ -6477,8 +6501,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6]
@@ -6489,8 +6513,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm7, %ymm14
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4]
@@ -6503,8 +6527,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm8, %ymm7
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 85ed61811af53..bd7730997a2e5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -469,13 +469,14 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
 ; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT:    vmovaps (%rdi), %xmm3
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1],xmm2[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,3]
 ; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm3[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -491,11 +492,10 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm5[2,3]
 ; AVX-NEXT:    vmovaps %xmm4, (%rsi)
 ; AVX-NEXT:    vmovaps %xmm2, (%rdx)
 ; AVX-NEXT:    vmovaps %xmm3, (%rcx)
@@ -983,74 +983,76 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-LABEL: load_i32_stride6_vf8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm3
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm4
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm2
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm3
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm0
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm5
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
 ; AVX-NEXT:    vmovaps (%rdi), %ymm7
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm8[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,3]
-; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm9
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT:    vmovaps (%rdi), %xmm8
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm4[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3]
+; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm5, %ymm9
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[0,0],ymm0[6,4],ymm9[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm1[2,2],ymm10[6,4],ymm1[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm5[2,2],ymm10[6,4],ymm5[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm2[0,1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm3[1],ymm10[3],ymm3[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[1,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm4[3,0],xmm8[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[1,3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[3,0],ymm9[1,0],ymm0[7,4],ymm9[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,3],ymm8[6,4],ymm1[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm5[2,3],ymm8[6,4],ymm5[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm3[1,3],ymm10[7,5],ymm3[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm11
 ; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm5[2,0],ymm0[6,5],ymm5[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[2,1],ymm0[7,5],ymm5[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm8[2,2,3,3]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm10
+; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm11
+; AVX-NEXT:    vmovaps 80(%rdi), %xmm12
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm12[2,0],xmm11[0,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm3[1,0],ymm2[7,4],ymm3[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,3],ymm2[2,0],ymm3[4,7],ymm2[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm8[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm12[3,0],xmm11[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[2,0],ymm0[5,5],ymm8[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX-NEXT:    vmovaps %ymm4, (%rdx)
 ; AVX-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX-NEXT:    vmovaps %ymm7, (%r8)
-; AVX-NEXT:    vmovaps %ymm8, (%r9)
+; AVX-NEXT:    vmovaps %ymm5, (%r8)
+; AVX-NEXT:    vmovaps %ymm7, (%r9)
 ; AVX-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -1287,360 +1289,392 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i32_stride6_vf8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512DQ-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i32_stride6_vf8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512BW-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512BW-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512BW-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512BW-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512BW-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512BW-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512BW-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512BW-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512BW-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512BW-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512BW-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512BW-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512BW-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512BW-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512BW-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512BW-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-BW-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-BW-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-BW-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-BW-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-BW-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <48 x i32>, ptr %in.vec, align 64
@@ -1953,161 +1987,166 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride6_vf16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX-NEXT:    subq $296, %rsp # imm = 0x128
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm10
 ; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm7
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm9
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm15
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps (%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm6[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3]
-; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm15, %ymm8
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,2],ymm4[6,4],ymm15[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vmovapd %ymm7, %ymm1
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[3],ymm1[2]
-; AVX-NEXT:    vmovapd %ymm1, %ymm13
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
+; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm13
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm12
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm3[0,2],xmm0[0,3]
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm11
-; AVX-NEXT:    vmovaps 256(%rdi), %ymm10
-; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm10, %ymm5
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm11[2,0],ymm5[0,0],ymm11[6,4],ymm5[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm10[2,2],ymm14[6,4],ymm10[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm3
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 320(%rdi), %ymm12
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm3[0],ymm12[1],ymm3[3],ymm12[2]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[1,3]
-; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,3],ymm4[6,4],ymm15[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovaps (%rdi), %xmm11
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm1[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,3]
+; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm12, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[2,0],ymm8[0,0],ymm15[6,4],ymm8[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm12[2,2],ymm5[6,4],ymm12[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm4[0,1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm7[0],ymm13[1],ymm7[3],ymm13[2]
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm9[6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm10[4,5],ymm14[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm5
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm6[0,2],xmm0[0,3]
+; AVX-NEXT:    vmovaps 288(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 256(%rdi), %ymm14
+; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm14, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[2,0],ymm4[0,0],ymm6[6,4],ymm4[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm14[2,2],ymm10[6,4],ymm14[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm2
+; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 320(%rdi), %ymm10
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[0,1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm11[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[1,3]
+; AVX-NEXT:    vmovups %ymm15, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm12[2,3],ymm3[6,4],ymm12[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm13[1,3],ymm7[7,5],ymm13[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm5[1,0],ymm11[7,4],ymm5[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,3],ymm1[6,4],ymm10[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm13[1,3],ymm7[7,5],ymm13[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,0],xmm5[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm4[1,0],ymm6[7,4],ymm4[5,4]
+; AVX-NEXT:    vmovaps %ymm6, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm12[1,3],ymm3[7,5],ymm12[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[3,1],ymm10[1,3],ymm2[7,5],ymm10[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm15[2,0],ymm9[6,5],ymm15[6,4]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm7[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm3[2,0],ymm4[4,4],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[2,1],ymm10[2,0],ymm11[6,5],ymm10[6,4]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[2,0],xmm3[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm6[4,5],ymm13[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,0],ymm7[2,0],ymm8[4,4],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[3,1],ymm15[2,1],ymm1[7,5],ymm15[6,5]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm6[3,1],ymm0[4,5],ymm6[7,5]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm8[3,1],xmm14[3,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm10[2,1],ymm11[7,5],ymm10[6,5]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovaps %ymm4, %ymm2
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm14[2,0],ymm4[6,5],ymm14[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[2,0],xmm9[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[0,0],ymm15[2,0],ymm11[4,4],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm7[3,1],ymm8[4,5],ymm7[7,5]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[3,3]
+; AVX-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm12[2,1],ymm4[7,5],ymm12[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5,6,7]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm6[1],ymm15[0],ymm6[2],ymm15[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,0],ymm1[4,5],ymm7[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm9[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm7[0,0],ymm9[6,4],ymm7[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm8[2,0],ymm7[4,6],ymm8[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm15[3,1],ymm11[4,5],ymm15[7,5]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[3,1],ymm14[2,1],ymm2[7,5],ymm14[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7]
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm12
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3]
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm10[0],ymm1[2],ymm10[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,0],ymm11[4,5],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm8[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm14[0,0],ymm8[6,4],ymm14[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm2[2,0],ymm14[4,6],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm7[1,0],ymm9[7,4],ymm7[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[0,3],ymm4[2,0],ymm7[4,7],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm15[1,3],ymm6[7,5],ymm15[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[1,1],ymm5[2,0],ymm6[5,5],ymm5[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm14[1,0],ymm8[7,4],ymm14[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,3],ymm4[2,0],ymm14[4,7],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm10[1,3],ymm1[7,5],ymm10[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[2,0],ymm11[5,5],ymm1[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm14
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,2,3,3]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm15
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0],xmm0[1],xmm15[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm3
+; AVX-NEXT:    vmovaps 80(%rdi), %xmm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,0],xmm3[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,1],ymm8[2,0],ymm4[4,5],ymm8[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[2,0],ymm9[0,0],ymm7[6,4],ymm9[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm8[2,0],ymm9[4,6],ymm8[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7]
+; AVX-NEXT:    vmovaps 224(%rdi), %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[2,2,3,3]
+; AVX-NEXT:    vmovaps 208(%rdi), %xmm13
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm6
+; AVX-NEXT:    vmovaps 272(%rdi), %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm6[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,1],ymm12[2,0],ymm1[4,5],ymm12[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm12[0,0],ymm5[6,4],ymm12[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[0,2],ymm4[2,0],ymm12[4,6],ymm4[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm9[1,0],ymm7[7,4],ymm9[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[0,3],ymm7[2,0],ymm9[4,7],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm14[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[1,0]
+; AVX-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1],ymm2[2,0],ymm3[5,5],ymm2[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm9[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm12[1,0],ymm5[7,4],ymm12[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[0,3],ymm3[2,0],ymm12[4,7],ymm3[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm10[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,0],xmm6[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,0],ymm1[5,5],ymm0[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2118,19 +2157,18 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm1, (%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, (%rcx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, (%r8)
-; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%r9)
+; AVX-NEXT:    vmovaps %ymm4, 32(%r9)
+; AVX-NEXT:    vmovaps %ymm8, (%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm3, (%rax)
-; AVX-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX-NEXT:    vmovaps %ymm2, (%rax)
+; AVX-NEXT:    addq $296, %rsp # imm = 0x128
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -4000,387 +4038,399 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride6_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $1048, %rsp # imm = 0x418
+; AVX-NEXT:    subq $1032, %rsp # imm = 0x408
 ; AVX-NEXT:    vmovaps 416(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 384(%rdi), %ymm9
-; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm6
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 384(%rdi), %ymm8
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm12
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm6
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps (%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm4[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
-; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm3, %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
+; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm6, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[0,0],ymm12[6,4],ymm1[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm5[0,1]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm3
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm3[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
-; AVX-NEXT:    vmovaps 480(%rdi), %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vmovaps 384(%rdi), %xmm7
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm2[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
+; AVX-NEXT:    vmovaps 480(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 448(%rdi), %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm5, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 448(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm2, %ymm9
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,2],ymm1[6,4],ymm5[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 544(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 512(%rdi), %ymm2
-; AVX-NEXT:    vmovupd %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
+; AVX-NEXT:    vmovapd 512(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 288(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 256(%rdi), %ymm5
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm5, %ymm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm2
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 320(%rdi), %ymm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[0,1]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm5, %ymm8
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm8[0,0],ymm4[6,4],ymm8[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm5[2,2],ymm4[6,4],ymm5[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 320(%rdi), %ymm11
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[2,3],ymm4[0,1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
+; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 608(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 576(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 576(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm2[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,3]
-; AVX-NEXT:    vmovaps 672(%rdi), %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX-NEXT:    vmovaps 576(%rdi), %xmm10
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm5[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,3]
+; AVX-NEXT:    vmovaps 672(%rdi), %ymm8
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 640(%rdi), %ymm0
-; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm0, %ymm15
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,0],ymm15[0,0],ymm5[6,4],ymm15[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,2],ymm14[6,4],ymm0[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm6[0,1,2],ymm14[3,4,5,6,7]
-; AVX-NEXT:    vmovapd 736(%rdi), %ymm0
+; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm0, %ymm9
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm9[0,0],ymm8[6,4],ymm9[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm0[2,2],ymm15[6,4],ymm0[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7]
+; AVX-NEXT:    vmovapd 736(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 704(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 704(%rdi), %ymm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[0],ymm5[1],ymm12[3],ymm5[2]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm4[0,1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm8[0],ymm0[1],ymm8[3],ymm0[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm4[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[0,2],xmm4[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm14[2,3],ymm8[6,4],ymm14[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm13 = xmm3[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm13[2,0],xmm3[1,3]
+; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm13 = ymm12[3,0],mem[1,0],ymm12[7,4],mem[5,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm4[2,3],ymm13[6,4],ymm4[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm5[1,3],ymm0[7,5],ymm5[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm11[1,0],xmm3[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm9[1,0],ymm8[7,4],ymm9[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm7[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm6[3,0],mem[1,0],ymm6[7,4],mem[5,4]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm9[1,3],ymm0[7,5],ymm9[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm10[1,0],xmm1[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm1[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm0[3,0],mem[1,0],ymm0[7,4],mem[5,4]
-; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm3[2,0],mem[2,3],ymm3[6,4],mem[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm6[2,3],ymm3[6,4],ymm6[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[1,0],xmm2[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm2[1,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm1[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm0[3,0],mem[1,0],ymm0[7,4],mem[5,4]
+; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm2[2,0],mem[2,3],ymm2[6,4],mem[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm11[1,3],ymm14[7,5],ymm11[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[3,0],xmm10[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm5[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm9[1,0],ymm14[7,4],ymm9[5,4]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm15[1,0],ymm10[7,4],ymm15[5,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,3],ymm1[6,4],ymm11[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,3],ymm1[6,4],ymm10[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm15[1,3],ymm12[7,5],ymm15[5,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[2,1],ymm14[2,0],ymm13[6,5],ymm14[6,4]
+; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[2,1],ymm4[2,0],ymm12[6,5],ymm4[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm0
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[2,1],ymm6[2,0],ymm8[6,5],ymm6[6,4]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,1],ymm6[2,0],ymm0[6,5],ymm6[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm0[2,0],xmm1[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm6[0,0],ymm2[2,0],ymm6[4,4],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[2,0],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm9[2,0],ymm0[4,4],ymm9[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[2,1],ymm11[2,0],ymm10[6,5],ymm11[6,4]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm0
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[2,1],ymm10[2,0],ymm14[6,5],ymm10[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm5[2,0],xmm0[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm10 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm11[0,0],ymm10[2,0],ymm11[4,4],ymm10[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm13[0,0],ymm10[2,0],ymm13[4,4],ymm10[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm15[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm8[2,0],ymm9[6,5],ymm8[6,4]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm6[2,0],ymm5[6,5],ymm6[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,0,1]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5]
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm4[3,1],mem[3,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm7[2,1],ymm13[7,5],ymm7[6,5]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm2[3,1],ymm6[4,5],ymm2[7,5]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps $247, (%rsp), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm3[3,1],mem[3,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm12 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[0,1],ymm9[3,1],ymm3[4,5],ymm9[7,5]
 ; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm2 = xmm2[3,1],mem[3,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm7[3,1],mem[2,1],ymm7[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3]
-; AVX-NEXT:    vmovaps %ymm9, %ymm3
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm8[2,1],ymm9[7,5],ymm8[6,5]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[3,1],xmm1[3,3]
+; AVX-NEXT:    vmovaps %ymm5, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm6[2,1],ymm5[7,5],ymm6[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm10[3,1],ymm11[4,5],ymm10[7,5]
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm5[3,1],mem[3,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm10[3,1],ymm13[4,5],ymm10[7,5]
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm11[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,1],ymm15[2,1],ymm11[7,5],ymm15[6,5]
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm15[3,1],mem[2,1],ymm15[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 416(%rdi), %xmm0
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vmovaps 400(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
+; AVX-NEXT:    vmovaps 448(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 464(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm9[0,0],ymm2[6,4],ymm9[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm1[2,0],ymm9[4,6],ymm1[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm14
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3]
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[1],ymm7[0],ymm5[2],ymm7[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,0],ymm13[4,5],ymm1[6,4]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm13
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm12
+; AVX-NEXT:    vmovaps 80(%rdi), %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm10[2,0],xmm12[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,0],ymm14[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm1[0,0],ymm4[6,4],ymm1[4,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm1[0,0],ymm11[6,4],ymm1[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 224(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vmovaps 208(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
-; AVX-NEXT:    vmovapd 272(%rdi), %xmm2
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm5
+; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 272(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm4[2,0],ymm2[4,6],ymm4[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovaps 608(%rdi), %xmm13
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm13[2,2,3,3]
-; AVX-NEXT:    vmovaps 592(%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
-; AVX-NEXT:    vmovapd 656(%rdi), %xmm10
-; AVX-NEXT:    vmovaps %ymm15, %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm10[1],ymm15[0],ymm10[2],ymm15[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm11[0,1],ymm15[2,0],ymm11[4,5],ymm15[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[2,0],ymm0[0,0],ymm12[6,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm4[3,0],ymm1[1,0],ymm4[7,4],ymm1[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm7[2,0],ymm1[4,7],ymm7[6,4]
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm14[0,1],mem[2,3]
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm5[2,0],ymm4[5,5],ymm5[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovaps 608(%rdi), %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[2,2,3,3]
+; AVX-NEXT:    vmovaps 592(%rdi), %xmm6
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3]
+; AVX-NEXT:    vmovaps 640(%rdi), %xmm7
+; AVX-NEXT:    vmovaps 656(%rdi), %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[2,0],xmm7[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm0[0,0],ymm4[6,4],ymm0[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm15[2,0],ymm0[4,6],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm1[1,0],ymm11[7,4],ymm1[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm11[2,0],ymm1[4,7],ymm11[6,4]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm13[0,1],mem[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm12[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm9[1,0],ymm1[7,4],ymm9[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,3],ymm1[2,0],ymm9[4,7],ymm1[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, (%rsp), %xmm4, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm4[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm4[1,1],ymm9[2,0],ymm4[5,5],ymm9[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm2[1,0],ymm6[7,4],ymm2[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[1,0],ymm1[7,4],ymm3[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,3],ymm1[2,0],ymm3[4,7],ymm1[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm3[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps (%rsp), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm2 = xmm2[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[1,1],ymm6[2,0],ymm4[5,5],ymm6[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = xmm9[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm0[1,0],ymm4[7,4],ymm0[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm13[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[3,1],ymm3[1,3],ymm10[7,5],ymm3[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm8[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[3,0],xmm7[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
@@ -4414,7 +4464,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm2, 64(%r8)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, (%r8)
-; AVX-NEXT:    vmovaps %ymm11, 96(%r9)
+; AVX-NEXT:    vmovaps %ymm14, 96(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm2, 32(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -4424,9 +4474,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm5, 64(%rax)
-; AVX-NEXT:    vmovaps %ymm7, (%rax)
-; AVX-NEXT:    addq $1048, %rsp # imm = 0x418
+; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
+; AVX-NEXT:    vmovaps %ymm11, (%rax)
+; AVX-NEXT:    addq $1032, %rsp # imm = 0x408
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -8032,52 +8082,54 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride6_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $2616, %rsp # imm = 0xA38
+; AVX-NEXT:    subq $2472, %rsp # imm = 0x9A8
 ; AVX-NEXT:    vmovaps 608(%rdi), %ymm6
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 576(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm4
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 320(%rdi), %ymm5
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm3
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 320(%rdi), %ymm9
 ; AVX-NEXT:    vmovaps 288(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 256(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 256(%rdi), %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm13
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm13[0,3]
-; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm3, %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm10
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[0,3]
+; AVX-NEXT:    vinsertf128 $1, 288(%rdi), %ymm5, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,2],ymm1[6,4],ymm5[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm3[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
+; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm11
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[0,3]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm8
+; AVX-NEXT:    vmovaps 576(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,3]
 ; AVX-NEXT:    vmovaps 672(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 640(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm3, %ymm1
+; AVX-NEXT:    vmovaps 640(%rdi), %ymm11
+; AVX-NEXT:    vinsertf128 $1, 672(%rdi), %ymm11, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,2],ymm1[6,4],ymm11[6,6]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 736(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8089,23 +8141,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 992(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 960(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 992(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT:    vmovaps 960(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX-NEXT:    vmovaps 960(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,3]
 ; AVX-NEXT:    vmovaps 1056(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1024(%rdi), %ymm12
-; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm12, %ymm1
+; AVX-NEXT:    vmovaps 1024(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm3, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,2],ymm1[6,4],ymm12[6,6]
-; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1120(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8117,23 +8170,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1376(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1344(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 1376(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT:    vmovaps 1344(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT:    vmovaps 1344(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
 ; AVX-NEXT:    vmovaps 1440(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1408(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 1440(%rdi), %ymm3, %ymm1
+; AVX-NEXT:    vmovaps 1408(%rdi), %ymm12
+; AVX-NEXT:    vinsertf128 $1, 1440(%rdi), %ymm12, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,2],ymm1[6,4],ymm12[6,6]
+; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1504(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8145,23 +8199,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps (%rdi), %ymm1
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm3, %ymm1
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm6
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 96(%rdi), %ymm6, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8173,193 +8228,198 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 416(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 416(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT:    vmovaps 384(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vmovaps 384(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
 ; AVX-NEXT:    vmovaps 480(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 448(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm3, %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[0,0],ymm1[6,4],ymm5[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT:    vmovaps 448(%rdi), %ymm6
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 480(%rdi), %ymm6, %ymm7
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[0,0],ymm1[6,4],ymm7[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 544(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 512(%rdi), %ymm3
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1]
-; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
+; AVX-NEXT:    vmovapd 512(%rdi), %ymm6
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[0,1]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[3],ymm6[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 800(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 768(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 800(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 768(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovaps 768(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX-NEXT:    vmovaps 864(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 832(%rdi), %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 864(%rdi), %ymm5, %ymm7
+; AVX-NEXT:    vmovaps 864(%rdi), %ymm6
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 832(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm7[0,0],ymm3[6,4],ymm7[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm5[2,2],ymm3[6,4],ymm5[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX-NEXT:    vmovapd 928(%rdi), %ymm3
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 896(%rdi), %ymm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[0,1]
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vinsertf128 $1, 864(%rdi), %ymm7, %ymm13
+; AVX-NEXT:    vmovups %ymm13, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm13[0,0],ymm6[6,4],ymm13[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[2,2],ymm6[6,4],ymm7[6,6]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
+; AVX-NEXT:    vmovapd 928(%rdi), %ymm6
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 896(%rdi), %ymm7
+; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[0,1]
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1184(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 1184(%rdi), %ymm6
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1152(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1152(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,3]
-; AVX-NEXT:    vmovaps 1248(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vmovaps 1152(%rdi), %xmm6
+; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,3]
+; AVX-NEXT:    vmovaps 1248(%rdi), %ymm13
+; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1216(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, 1248(%rdi), %ymm7, %ymm14
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[2,0],ymm14[0,0],ymm3[6,4],ymm14[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm14[0,0],ymm13[6,4],ymm14[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm9[0,1,2],ymm15[3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1312(%rdi), %ymm3
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1280(%rdi), %ymm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[0,1]
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm3[0],ymm7[1],ymm3[3],ymm7[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1,2],ymm15[3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1312(%rdi), %ymm6
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 1280(%rdi), %ymm13
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[0,1]
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm6[0],ymm13[1],ymm6[3],ymm13[2]
+; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm3[1,0],xmm13[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm14[0,2],xmm13[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm3[3,0],mem[1,0],ymm3[7,4],mem[5,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm3[2,3],ymm14[6,4],ymm3[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm15[3,0],mem[1,0],ymm15[7,4],mem[5,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm6[2,3],ymm14[6,4],ymm6[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[3,1],ymm9[1,3],ymm7[7,5],ymm9[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm7[1,0],xmm11[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm13[0,2],xmm11[1,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm8[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[1,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm14[3,0],mem[1,0],ymm14[7,4],mem[5,4]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm10 = ymm14[3,0],mem[1,0],ymm14[7,4],mem[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm11[2,3],ymm10[6,4],ymm11[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm7[2,3],ymm13[6,4],ymm7[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm9[1,3],ymm13[7,5],ymm9[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,0],xmm8[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm11[0,2],xmm8[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm11 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm12[1,3],ymm11[7,5],ymm12[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[3,1],ymm7[1,3],ymm9[7,5],ymm7[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
 ; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[0,2],xmm6[1,3]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm5[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm8[2,0],xmm5[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm8 = ymm10[3,0],mem[1,0],ymm10[7,4],mem[5,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm9[2,3],ymm8[6,4],ymm9[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm11[3,0],mem[1,0],ymm11[7,4],mem[5,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm15[2,3],ymm8[6,4],ymm15[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm11[1,3],ymm8[7,5],ymm11[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[0,2],xmm4[1,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm4[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[1,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm8[3,0],mem[1,0],ymm8[7,4],mem[5,4]
-; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm6[2,0],mem[2,3],ymm6[6,4],mem[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm8[3,0],mem[1,0],ymm8[7,4],mem[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm12[2,3],ymm5[6,4],ymm12[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm4[3,0],mem[1,0],ymm4[7,4],mem[5,4]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm3[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm5[3,0],mem[1,0],ymm5[7,4],mem[5,4]
 ; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm4 = ymm4[2,0],mem[2,3],ymm4[6,4],mem[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm6[1,3],ymm4[7,5],ymm6[5,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm2[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm3[3,0],mem[1,0],ymm3[7,4],mem[5,4]
+; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm3[2,0],mem[2,3],ymm3[6,4],mem[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,1],ymm4[1,3],ymm3[7,5],ymm4[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm10[1,0],xmm1[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[1,3]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm1[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[1,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    vshufps $19, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm2[3,0],mem[1,0],ymm2[7,4],mem[5,4]
 ; AVX-NEXT:    vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm2[2,0],mem[2,3],ymm2[6,4],mem[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm4[1,3],ymm2[7,5],ymm4[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[1,0],xmm0[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm0[3,0],mem[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm1 = ymm1[3,0],mem[1,0],ymm1[7,4],mem[5,4]
@@ -8367,8 +8427,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # ymm1 = ymm1[2,0],mem[2,3],ymm1[6,4],mem[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8376,34 +8435,33 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm3[2,0],ymm0[6,5],ymm3[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[2,1],ymm6[2,0],ymm15[6,5],ymm6[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3,0,1]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm15[2,0],ymm1[4,4],ymm15[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[2,1],ymm7[2,0],ymm14[6,5],ymm7[6,4]
+; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8414,18 +8472,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm0 = ymm13[2,1],mem[2,0],ymm13[6,5],mem[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[2,1],ymm9[2,0],ymm10[6,5],ymm9[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8433,26 +8490,26 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[2,1],ymm15[2,0],ymm11[6,5],ymm15[6,4]
+; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm10 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm11[2,0],ymm1[4,4],ymm11[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm5[2,1],mem[2,0],ymm5[6,5],mem[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8479,91 +8536,90 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm1
+; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm4[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,0],ymm4[2,0],ymm9[4,4],ymm4[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[2,1],ymm6[2,0],ymm3[6,5],ymm6[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm1[2,0],xmm5[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm13[2,0],ymm2[4,4],ymm13[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,1],mem[3,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm15[3,1],mem[2,1],ymm15[7,5],mem[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm1[3,1],mem[3,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm1[3,1],mem[3,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm0[3,1],mem[3,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,1],mem[3,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm12 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = xmm1[3,1],mem[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm11[3,1],ymm0[4,5],ymm11[7,5]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5]
@@ -8571,32 +8627,31 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
-; AVX-NEXT:    vmovaps %ymm4, %ymm15
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm13[3,1],ymm2[4,5],ymm13[7,5]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5]
-; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm8[3,1],mem[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm4[3,1],ymm9[4,5],ymm4[7,5]
+; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm7[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm3[3,1],mem[2,1],ymm3[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm8[3,1],ymm0[4,5],ymm8[7,5]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = xmm1[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5]
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm4[3,1],mem[2,1],ymm4[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
@@ -8604,34 +8659,35 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = xmm1[3,1],mem[3,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5]
+; AVX-NEXT:    vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = ymm5[3,1],mem[2,1],ymm5[7,5],mem[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 80(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm11[0,0],ymm2[6,4],ymm11[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm1[2,0],ymm11[4,6],ymm1[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[2,0],ymm10[0,0],ymm9[6,4],ymm10[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8644,11 +8700,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 208(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm5
+; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 272(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
@@ -8665,9 +8722,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 400(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
+; AVX-NEXT:    vmovaps 448(%rdi), %xmm5
+; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 464(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
@@ -8684,10 +8743,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 592(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 656(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
+; AVX-NEXT:    vmovaps 640(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 656(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8697,42 +8757,45 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 800(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vmovaps 784(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 848(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
+; AVX-NEXT:    vmovaps 832(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 848(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm10[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm3[0,0],ymm10[6,4],ymm3[4,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 992(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vmovaps 976(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 1040(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
+; AVX-NEXT:    vmovaps 1024(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1040(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,0],ymm14[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8745,15 +8808,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1168(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT:    vmovapd 1232(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
+; AVX-NEXT:    vmovaps 1216(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1232(%rdi), %xmm6
+; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm6[2,0],xmm1[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm13[0,0],ymm8[6,4],ymm13[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm13[0,2],ymm6[2,0],ymm13[4,6],ymm6[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8762,125 +8827,126 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1376(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT:    vmovaps 1360(%rdi), %xmm13
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
-; AVX-NEXT:    vmovapd 1424(%rdi), %xmm13
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm13[1],mem[0],ymm13[2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm11[1,0],ymm13[7,4],ymm11[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,3],ymm13[2,0],ymm11[4,7],ymm13[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm13[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps 1360(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 1408(%rdi), %xmm14
+; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1424(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm0[2,0],xmm14[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm7[1,0],ymm13[7,4],ymm7[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm13[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm5[1,0],ymm13[7,4],ymm5[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm13[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm15[2,0],ymm0[4,6],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm14[2,0],ymm10[4,7],ymm14[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm14[2,0],ymm7[4,7],ymm14[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm14[2,0],ymm5[4,7],ymm14[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1],mem[2,3]
-; AVX-NEXT:    vmovups (%rsp), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm10[3,0],mem[1,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,1],ymm14[2,0],ymm15[5,5],ymm14[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm3[1,0],ymm10[7,4],ymm3[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm10[2,0],ymm3[4,7],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm1[3,0],ymm2[1,0],ymm1[7,4],ymm2[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm10 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, (%rsp), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm11[3,0],mem[1,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,1],ymm12[2,0],ymm13[5,5],ymm12[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm9[2,0],ymm2[4,7],ymm9[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = xmm9[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[1,0],ymm8[7,4],ymm13[5,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,3],ymm8[2,0],ymm13[4,7],ymm8[6,4]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm8 = xmm8[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = xmm6[0,1],mem[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm8[2,0],ymm9[5,5],ymm8[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[3,0],mem[1,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
@@ -8972,8 +9038,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm4, 96(%rax)
 ; AVX-NEXT:    vmovaps %ymm5, 64(%rax)
 ; AVX-NEXT:    vmovaps %ymm7, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm11, (%rax)
-; AVX-NEXT:    addq $2616, %rsp # imm = 0xA38
+; AVX-NEXT:    vmovaps %ymm9, (%rax)
+; AVX-NEXT:    addq $2472, %rsp # imm = 0x9A8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 7948141f6becd..16b50a5da694a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -173,9 +173,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm4 = [4,3,0,0]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm7 = [5,4,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
@@ -193,33 +193,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vmovd %xmm1, %r11d
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512-NEXT:    vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512-NEXT:    vpermps (%rdi), %zmm1, %zmm1
-; AVX512-NEXT:    vmovaps (%rdi), %ymm5
-; AVX512-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX512-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX512-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512-NEXT:    vmovq %xmm0, (%r8)
-; AVX512-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512-NEXT:    vmovlps %xmm7, (%r10)
-; AVX512-NEXT:    vmovlps %xmm5, (%rax)
+; AVX512-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512-NEXT:    vmovd %xmm2, %r11d
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512-NEXT:    vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512-NEXT:    vpbroadcastd 8(%rdi), %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512-NEXT:    vpermps %zmm0, %zmm6, %zmm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512-NEXT:    vpermps %zmm0, %zmm7, %zmm0
+; AVX512-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512-NEXT:    vmovq %xmm4, (%rdx)
+; AVX512-NEXT:    vmovq %xmm5, (%rcx)
+; AVX512-NEXT:    vmovq %xmm1, (%r8)
+; AVX512-NEXT:    vmovlps %xmm2, (%r9)
+; AVX512-NEXT:    vmovlps %xmm6, (%r10)
+; AVX512-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -257,33 +254,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-NEXT:    vmovd %xmm1, %r11d
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-NEXT:    vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm5
-; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT:    vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512DQ-NEXT:    vmovlps %xmm7, (%r10)
-; AVX512DQ-NEXT:    vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-NEXT:    vmovd %xmm2, %r11d
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512DQ-NEXT:    vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512DQ-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512DQ-NEXT:    vpermps %zmm0, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512DQ-NEXT:    vpermps %zmm0, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512DQ-NEXT:    vmovq %xmm4, (%rdx)
+; AVX512DQ-NEXT:    vmovq %xmm5, (%rcx)
+; AVX512DQ-NEXT:    vmovq %xmm1, (%r8)
+; AVX512DQ-NEXT:    vmovlps %xmm2, (%r9)
+; AVX512DQ-NEXT:    vmovlps %xmm6, (%r10)
+; AVX512DQ-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -321,33 +315,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512BW-NEXT:    vmovd %xmm1, %r11d
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512BW-NEXT:    vpermps (%rdi), %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovaps (%rdi), %ymm5
-; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX512BW-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512BW-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512BW-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512BW-NEXT:    vmovq %xmm0, (%r8)
-; AVX512BW-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512BW-NEXT:    vmovlps %xmm7, (%r10)
-; AVX512BW-NEXT:    vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512BW-NEXT:    vmovd %xmm2, %r11d
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512BW-NEXT:    vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpbroadcastd 8(%rdi), %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512BW-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512BW-NEXT:    vpermps %zmm0, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512BW-NEXT:    vpermps %zmm0, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512BW-NEXT:    vmovq %xmm4, (%rdx)
+; AVX512BW-NEXT:    vmovq %xmm5, (%rcx)
+; AVX512BW-NEXT:    vmovq %xmm1, (%r8)
+; AVX512BW-NEXT:    vmovlps %xmm2, (%r9)
+; AVX512BW-NEXT:    vmovlps %xmm6, (%r10)
+; AVX512BW-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -385,33 +376,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-BW-NEXT:    vmovd %xmm1, %r11d
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-BW-NEXT:    vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %ymm5
-; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-BW-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-NEXT:    vmovlps %xmm7, (%r10)
-; AVX512DQ-BW-NEXT:    vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-BW-NEXT:    vmovd %xmm2, %r11d
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512DQ-BW-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512DQ-BW-NEXT:    vpermps %zmm0, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512DQ-BW-NEXT:    vpermps %zmm0, %zmm7, %zmm0
+; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovq %xmm5, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovq %xmm1, (%r8)
+; AVX512DQ-BW-NEXT:    vmovlps %xmm2, (%r9)
+; AVX512DQ-BW-NEXT:    vmovlps %xmm6, (%r10)
+; AVX512DQ-BW-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -535,17 +523,18 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX-NEXT:    vmovaps 80(%rdi), %xmm6
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[2]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm5[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2]
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
 ; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
 ; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
@@ -560,14 +549,13 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm10[2,0],xmm5[3,2]
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3]
 ; AVX-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX-NEXT:    vmovaps %xmm7, (%rdx)
-; AVX-NEXT:    vmovaps %xmm8, (%rcx)
+; AVX-NEXT:    vmovaps %xmm8, (%rdx)
+; AVX-NEXT:    vmovaps %xmm6, (%rcx)
 ; AVX-NEXT:    vmovaps %xmm3, (%r8)
 ; AVX-NEXT:    vmovaps %xmm4, (%r9)
 ; AVX-NEXT:    vmovaps %xmm5, (%r10)
@@ -1148,8 +1136,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-LABEL: load_i32_stride7_vf8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm7
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm10
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm0
 ; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm12
@@ -1157,86 +1144,87 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT:    vmovaps (%rdi), %xmm14
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX-NEXT:    vmovaps (%rdi), %xmm13
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm7
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1,2],ymm3[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm3
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm5
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1]
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm3[1]
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm11
-; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm11[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[1,1],ymm10[2,2],ymm12[5,5],ymm10[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm9[0],xmm14[1],xmm9[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm7[2,3],ymm4[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm13[3,3],ymm7[4,4],ymm13[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
-; AVX-NEXT:    vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm11[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm14[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
-; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm13
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm15
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm14[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[0,1],ymm4[1,3],ymm13[4,5],ymm4[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm12[2,0],ymm7[4,6],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm7[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm14[0,0],ymm7[7,4],ymm14[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm12[2,0],ymm7[6,4],ymm12[6,4]
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm12
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[2,1],ymm4[3,3],ymm13[6,5],ymm4[7,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm3[0],xmm5[1],xmm3[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm11[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm12[1,1],mem[2,2],ymm12[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm10[0],xmm13[1],xmm10[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm4[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm14[2,0],xmm9[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm11[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm14[0],xmm10[1],xmm14[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm2[3,0],xmm7[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm14
+; AVX-NEXT:    vmovaps 96(%rdi), %xmm15
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm14[0,2],ymm12[7,5],ymm14[4,6]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm10[0,1,2],xmm13[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm8[0,2],ymm13[2,0],ymm8[4,6],ymm13[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm4[2,0],ymm14[5,4],ymm4[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm13[2,0],ymm8[6,4],ymm13[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm15[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm7[0,1,2],xmm13[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[2,1],ymm4[3,3],ymm14[6,5],ymm4[7,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm3[0],xmm5[1],xmm3[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm4[2,0],ymm10[5,4],ymm4[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm10[2,0],xmm7[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm1[0,0],ymm13[7,4],ymm1[4,4]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm1[0,0],ymm14[7,4],ymm1[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX-NEXT:    vmovaps %ymm9, (%rdx)
 ; AVX-NEXT:    vmovaps %ymm11, (%rcx)
-; AVX-NEXT:    vmovaps %ymm10, (%r8)
-; AVX-NEXT:    vmovaps %ymm7, (%r9)
+; AVX-NEXT:    vmovaps %ymm12, (%r8)
+; AVX-NEXT:    vmovaps %ymm8, (%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovaps %ymm4, (%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -2326,234 +2314,233 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-LABEL: load_i32_stride7_vf16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm4
+; AVX-NEXT:    vmovaps 256(%rdi), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps (%rdi), %ymm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX-NEXT:    vmovaps 256(%rdi), %ymm2
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm15
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm5
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm10
+; AVX-NEXT:    vmovaps 80(%rdi), %xmm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm13
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps (%rdi), %xmm9
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT:    vmovaps 416(%rdi), %xmm12
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1]
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm3
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 80(%rdi), %xmm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX-NEXT:    vmovaps 304(%rdi), %xmm13
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
+; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps (%rdi), %xmm9
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 224(%rdi), %xmm7
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
+; AVX-NEXT:    vmovaps 416(%rdi), %xmm2
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm6
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm6[2,2],ymm5[5,5],ymm6[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 256(%rdi), %xmm11
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm8
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0],xmm9[1],xmm8[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 384(%rdi), %ymm7
-; AVX-NEXT:    vmovaps 352(%rdi), %ymm1
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm7[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm5[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm3[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm3
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm6
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0],xmm7[1],xmm6[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 384(%rdi), %ymm4
+; AVX-NEXT:    vmovaps 352(%rdi), %ymm0
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm4[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,0],ymm12[3,3],ymm0[4,4],ymm12[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
+; AVX-NEXT:    vinsertps {{.*#+}} xmm12 = zero,xmm12[1,2],xmm2[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm9[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm8[1],xmm11[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm12
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm12[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm10[2,1],ymm14[2,0],ymm10[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
 ; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm11
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm13[3,0],xmm11[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[2,1],ymm14[2,0],ymm15[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 96(%rdi), %xmm3
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[3,1],ymm2[0,2],ymm10[7,5],ymm2[4,6]
+; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm9
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,1],ymm5[1,3],ymm9[4,5],ymm5[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm10[2,0],ymm1[4,6],ymm10[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[0,0],ymm6[5,4],ymm5[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 416(%rdi), %ymm2
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm2[0,1],ymm7[1,3],ymm2[4,5],ymm7[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm9[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm6
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm5[2,0],ymm0[4,6],ymm5[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps 320(%rdi), %xmm5
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,1,0,1]
-; AVX-NEXT:    vmovaps 288(%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm11
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,1],ymm7[3,3],ymm2[6,5],ymm7[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm10
+; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm15[3,1],ymm10[0,2],ymm15[7,5],ymm10[4,6]
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 416(%rdi), %ymm15
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm15[0,1],ymm4[1,3],ymm15[4,5],ymm4[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[0,2],ymm10[2,0],ymm0[4,6],ymm10[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm5[2,0],ymm9[5,4],ymm5[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,0],ymm1[6,4],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm3[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm15[1,0],ymm4[2,0],ymm15[5,4],ymm4[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[2,1],ymm5[3,3],ymm9[6,5],ymm5[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0],xmm0[1],xmm14[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm8[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm12[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm14[0,0],ymm12[1,0],ymm14[4,4],ymm12[5,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[2,1],ymm4[3,3],ymm6[6,5],ymm4[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm11[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[2,1],ymm4[3,3],ymm15[6,5],ymm4[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0],xmm5[1],xmm7[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm11[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,0],ymm7[4,5],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[0,0],ymm6[7,4],ymm7[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[1,0],ymm12[2,0],ymm14[5,4],ymm12[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm9[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm9[3,0],ymm6[0,0],ymm9[7,4],ymm6[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm1[1],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[2,0],xmm2[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm15[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[3,0],ymm6[0,0],ymm15[7,4],ymm6[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm5[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, (%rsi)
+; AVX-NEXT:    vmovaps %ymm0, 32(%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%rsi)
+; AVX-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%rdx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%rcx)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%r8)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX-NEXT:    vmovaps %ymm0, 32(%r9)
 ; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm4, (%rax)
-; AVX-NEXT:    vmovaps %ymm15, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm13, (%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm5, (%rax)
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX-NEXT:    addq $456, %rsp # imm = 0x1C8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -4881,560 +4868,561 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride7_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $1432, %rsp # imm = 0x598
+; AVX-NEXT:    subq $1512, %rsp # imm = 0x5E8
 ; AVX-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 448(%rdi), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 544(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 544(%rdi), %ymm6
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm12
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm11
 ; AVX-NEXT:    vmovaps 80(%rdi), %xmm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
-; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps (%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps (%rdi), %xmm13
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 128(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm12
+; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[1]
+; AVX-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 528(%rdi), %xmm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT:    vmovaps 448(%rdi), %xmm8
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 608(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 576(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm7
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1]
-; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 528(%rdi), %xmm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX-NEXT:    vmovaps 640(%rdi), %xmm5
+; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm5[1]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 256(%rdi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps 448(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 224(%rdi), %xmm14
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 608(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX-NEXT:    vmovaps 304(%rdi), %xmm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2]
+; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 384(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 576(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT:    vmovaps 640(%rdi), %xmm9
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1]
+; AVX-NEXT:    vmovaps 352(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; AVX-NEXT:    vmovaps 416(%rdi), %xmm9
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm9[1]
 ; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 256(%rdi), %ymm1
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 704(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 672(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT:    vmovaps 672(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT:    vmovaps 768(%rdi), %ymm15
+; AVX-NEXT:    vmovaps 752(%rdi), %xmm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm11
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 832(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 800(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX-NEXT:    vmovaps 864(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm2[1]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,1],mem[2,2],ymm11[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm11
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0],xmm13[1],xmm11[2,3]
+; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 304(%rdi), %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm3[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0],ymm3[3,3],ymm2[4,4],ymm3[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 384(%rdi), %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,1],mem[2,2],ymm6[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT:    vmovaps 480(%rdi), %xmm7
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0],xmm8[1],xmm7[2,3]
+; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm8, %xmm2
+; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 608(%rdi), %ymm12
+; AVX-NEXT:    vmovaps 576(%rdi), %ymm6
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm12[0,1]
+; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[0,0],ymm4[3,3],ymm6[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm5[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 416(%rdi), %xmm3
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1]
-; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 704(%rdi), %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 672(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 352(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm4[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,0],ymm8[3,3],ymm4[4,4],ymm8[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm9[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 704(%rdi), %xmm1
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 832(%rdi), %ymm10
+; AVX-NEXT:    vmovaps 800(%rdi), %ymm3
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm10[0,1]
+; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm3[0,0],ymm9[3,3],ymm3[4,4],ymm9[7,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm4[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm13[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,0],xmm0[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[2,1],ymm9[2,0],ymm0[6,5],ymm9[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
+; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm9[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 768(%rdi), %ymm14
-; AVX-NEXT:    vmovaps 752(%rdi), %xmm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 832(%rdi), %xmm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 800(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT:    vmovaps 864(%rdi), %xmm6
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,0],xmm2[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm2[2,1],ymm8[2,0],ymm2[6,5],ymm8[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm0[2,2],ymm12[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT:    vmovaps %xmm1, %xmm14
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm15
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7]
-; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 512(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
-; AVX-NEXT:    vmovaps %ymm5, %ymm7
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 480(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 736(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm10[1],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 576(%rdi), %ymm12
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm12[0,0],ymm5[3,3],ymm12[4,4],ymm5[7,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[2,1],ymm5[2,0],ymm15[6,5],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
 ; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX-NEXT:    vmovaps 256(%rdi), %xmm0
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm3[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 736(%rdi), %ymm5
-; AVX-NEXT:    vmovaps %ymm14, %ymm3
-; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm5[2,2],ymm14[5,5],ymm5[6,6]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX-NEXT:    vmovaps 704(%rdi), %xmm4
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 832(%rdi), %ymm13
-; AVX-NEXT:    vmovaps 800(%rdi), %ymm2
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm13[0,1]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = mem[2,3,2,3]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm9[0,3],ymm14[7,5],ymm9[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm15[0],mem[0],ymm15[2],mem[2]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm10[0,3],ymm14[7,5],ymm10[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm14[2,0],ymm3[6,5],ymm14[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm13[0],ymm2[2],ymm13[2]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[2,3,2,3]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm3[0,3],ymm11[7,5],ymm3[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm2[2,1],ymm11[2,0],ymm2[6,5],ymm11[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
-; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm11[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[3,1],ymm4[0,2],ymm8[7,5],ymm4[4,6]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = xmm0[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6]
-; AVX-NEXT:    vmovaps %xmm15, %xmm10
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm15[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 640(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm4[1,3],ymm0[4,5],ymm4[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[2,0],ymm12[4,6],ymm11[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[0,0],ymm3[5,4],ymm2[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[3,1],ymm7[0,2],ymm2[7,5],ymm7[4,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm6[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 416(%rdi), %ymm15
-; AVX-NEXT:    vmovaps %ymm5, %ymm8
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2],ymm11[2,0],ymm13[4,6],ymm11[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[0,1,2],xmm0[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 864(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[0,1],ymm9[1,3],ymm0[4,5],ymm9[5,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm5[2,0],ymm4[6,5],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm5[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm12[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm6[0,0],ymm12[7,4],ymm6[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,0],ymm0[6,4],ymm6[6,4]
-; AVX-NEXT:    vmovaps 544(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 96(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 512(%rdi), %xmm7
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm1[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm13[1,3],ymm1[4,5],ymm13[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm2[2,0],ymm11[4,6],ymm2[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 544(%rdi), %xmm11
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4]
-; AVX-NEXT:    vmovaps %ymm14, %ymm6
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm4
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1]
-; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm10[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 640(%rdi), %ymm3
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,1],ymm2[1,3],ymm3[4,5],ymm2[5,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm8[2,0],ymm15[5,4],ymm8[6,4]
-; AVX-NEXT:    vmovaps %ymm15, %ymm11
-; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps 320(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm2[0,1,0,1]
-; AVX-NEXT:    vmovaps 288(%rdi), %xmm3
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm15 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm7[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 416(%rdi), %ymm1
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[0,1],ymm12[1,3],ymm1[4,5],ymm12[5,7]
+; AVX-NEXT:    vmovaps %ymm12, %ymm7
+; AVX-NEXT:    vmovaps %ymm1, %ymm12
+; AVX-NEXT:    vmovaps %ymm8, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,2],ymm5[2,0],ymm8[4,6],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
+; AVX-NEXT:    vmovaps 736(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 768(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[3,1],ymm0[0,2],ymm15[7,5],ymm0[4,6]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 864(%rdi), %ymm13
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[0,1],ymm9[1,3],ymm13[4,5],ymm9[5,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4]
-; AVX-NEXT:    vmovaps 768(%rdi), %xmm0
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1]
-; AVX-NEXT:    vmovaps 736(%rdi), %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm8[0,2],ymm14[2,0],ymm8[4,6],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm14[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[2,1],ymm6[3,3],ymm10[6,5],ymm6[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = xmm0[0],mem[1],xmm0[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm14[0,0],ymm6[7,4],ymm14[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,0],ymm1[6,4],ymm14[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm11[0,1,0,1]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm2[0,1,2],xmm14[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[3,0],ymm11[0,0],ymm0[7,4],ymm11[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm11[2,0],ymm6[6,4],ymm11[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm5[0,1,0,1]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm4[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm11[0,0],ymm4[7,4],ymm11[4,4]
+; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[1,0],ymm7[2,0],ymm12[5,4],ymm7[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm11[2,0],ymm4[6,4],ymm11[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm7[0,1,0,1]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm8[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm11[0,0],ymm8[7,4],ymm11[4,4]
+; AVX-NEXT:    vmovaps %ymm9, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm9[2,0],ymm13[5,4],ymm9[6,4]
+; AVX-NEXT:    vmovaps %ymm13, %ymm9
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm11[2,0],ymm3[6,4],ymm11[6,4]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[0,1,0,1]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm8[0,1,2],xmm11[3]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm1[2,1],ymm0[3,3],ymm1[6,5],ymm0[7,7]
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0],xmm1[1],mem[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm5[0,1,2],xmm15[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm4[1,0],ymm0[4,4],ymm4[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm14[2,0],xmm7[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = mem[0],xmm7[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,2],xmm3[3]
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm15[2,0],xmm14[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm14 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm14[2,0],xmm3[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX-NEXT:    vmovaps %ymm8, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[2,1],ymm9[3,3],ymm8[6,5],ymm9[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm9[0],xmm8[1],xmm9[2,3]
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = mem[0],xmm13[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,0],ymm14[2,0],ymm15[5,4],ymm14[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm15 = xmm6[0,1,2],xmm2[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm5[1,0],ymm2[4,4],ymm5[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm12[2,1],mem[3,3],ymm12[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = mem[0],xmm0[1],mem[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm1[2,0],ymm12[5,4],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm10[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm14[1,0],ymm1[4,4],ymm14[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm12[2,0],xmm2[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3]
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[2,0],xmm12[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm4[3,3],ymm9[6,5],ymm4[7,7]
+; AVX-NEXT:    vmovaps %ymm9, %ymm12
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0],xmm7[1],xmm9[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm8[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm5[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm2[0,0],ymm7[7,4],ymm2[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm2[0,1,2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[2,0],xmm2[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm4[0,0],ymm1[7,4],ymm4[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm5[0],mem[1],xmm5[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[0,0],ymm3[1,0],ymm0[4,4],ymm3[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm15[2,0],xmm10[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm11[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm8[0,0],ymm11[7,4],ymm8[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm13[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,0],ymm8[4,5],ymm2[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm3[2,0],ymm0[5,4],ymm3[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm12[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm5[0,0],ymm12[7,4],ymm5[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,0],ymm5[4,5],ymm0[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm5[0,0],ymm1[7,4],ymm5[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm6[0],mem[1],xmm6[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm5, 64(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm5, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 96(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm3[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[3,0],ymm5[0,0],ymm3[7,4],ymm5[4,4]
+; AVX-NEXT:    vpermilps $238, (%rsp), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,0],ymm8[4,5],ymm5[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1,2],mem[3]
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm8 = ymm14[1,0],mem[2,0],ymm14[5,4],mem[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm3[3,0],ymm8[0,0],ymm3[7,4],ymm8[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 96(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm12, 96(%rax)
-; AVX-NEXT:    vmovaps %ymm13, 32(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, (%rax)
+; AVX-NEXT:    vmovaps %ymm10, 96(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm2, (%rax)
+; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm5, (%rax)
 ; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
-; AVX-NEXT:    addq $1432, %rsp # imm = 0x598
+; AVX-NEXT:    vmovaps %ymm2, 64(%rax)
+; AVX-NEXT:    addq $1512, %rsp # imm = 0x5E8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -9927,25 +9915,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i32_stride7_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $3176, %rsp # imm = 0xC68
+; AVX-NEXT:    subq $3224, %rsp # imm = 0xC98
 ; AVX-NEXT:    vmovaps 704(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 672(%rdi), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 768(%rdi), %ymm11
+; AVX-NEXT:    vmovaps 768(%rdi), %ymm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 256(%rdi), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm8
 ; AVX-NEXT:    vmovaps 304(%rdi), %xmm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 224(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 384(%rdi), %xmm1
@@ -9961,12 +9951,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 752(%rdi), %xmm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovaps 672(%rdi), %xmm15
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps 672(%rdi), %xmm7
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 832(%rdi), %xmm1
@@ -9974,8 +9964,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 800(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 864(%rdi), %xmm13
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1]
+; AVX-NEXT:    vmovaps 864(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10016,11 +10007,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 1664(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1664(%rdi), %ymm15
 ; AVX-NEXT:    vmovaps 1648(%rdi), %xmm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 1728(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -10039,24 +10030,23 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps (%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovaps (%rdi), %xmm14
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm14
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm5
 ; AVX-NEXT:    vmovaps 80(%rdi), %xmm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
-; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm13
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1]
+; AVX-NEXT:    vmovaps %xmm13, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10066,9 +10056,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps 448(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovaps 448(%rdi), %xmm6
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10081,9 +10071,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 576(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 640(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
+; AVX-NEXT:    vmovaps 640(%rdi), %xmm10
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1]
+; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10093,732 +10083,663 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovaps 896(%rdi), %xmm12
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovaps 896(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 992(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 992(%rdi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 976(%rdi), %xmm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 1056(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 1024(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 1088(%rdi), %xmm8
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
-; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
+; AVX-NEXT:    vmovaps 1088(%rdi), %xmm12
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[1]
+; AVX-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1376(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1376(%rdi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1344(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT:    vmovaps 1344(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT:    vmovaps 1440(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 1424(%rdi), %xmm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1504(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1440(%rdi), %ymm11
+; AVX-NEXT:    vmovaps 1424(%rdi), %xmm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm2[0],ymm11[2],ymm2[2]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1504(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 1472(%rdi), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT:    vmovaps 1536(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 288(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
+; AVX-NEXT:    vmovaps 1536(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm0[1]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[1,1],mem[2,2],ymm8[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 256(%rdi), %xmm8
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 384(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 352(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = zero,xmm1[1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 736(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 352(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 704(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[1,1],mem[2,2],ymm0[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 704(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 832(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 800(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vmovaps 800(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1184(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1280(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1248(%rdi), %ymm2
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[1,1],mem[2,2],ymm9[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 1152(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1632(%rdi), %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0],xmm9[1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1280(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1248(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 1600(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 1600(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 1728(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1696(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = zero,xmm1[1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 1696(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = zero,xmm2[1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[1,1],mem[2,2],ymm5[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0],xmm14[1],xmm0[2,3]
+; AVX-NEXT:    vmovaps %xmm14, %xmm15
+; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 512(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm13[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,1],mem[2,2],ymm13[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
 ; AVX-NEXT:    vmovaps 480(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm0[0],xmm6[1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm4[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 608(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 576(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = zero,xmm3[1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[0,0],ymm6[3,3],ymm1[4,4],ymm6[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT:    vinsertps {{.*#+}} xmm6 = zero,xmm6[1,2],xmm10[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm6[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 960(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
 ; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX-NEXT:    vmovaps %xmm12, %xmm6
-; AVX-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm5[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 1056(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1024(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT:    vmovaps 1024(%rdi), %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1408(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX-NEXT:    vmovaps 1376(%rdi), %xmm4
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm0[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm12[2]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm11[1,1],mem[2,2],ymm11[5,5],mem[6,6]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX-NEXT:    vmovaps 1376(%rdi), %xmm5
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm5[0],xmm3[1],xmm5[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm6[3,4,5,6,7]
 ; AVX-NEXT:    vmovaps 1504(%rdi), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1472(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0],ymm8[3,3],ymm1[4,4],ymm8[7,7]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = zero,xmm8[1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm9[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm7[2,3,2,3]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT:    vmovaps 1472(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,0],ymm2[3,3],ymm4[4,4],ymm2[7,7]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT:    vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = zero,xmm2[1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm14[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 736(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 1184(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[2,1],ymm12[2,0],ymm14[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps %xmm3, %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX-NEXT:    vmovaps %xmm4, %xmm6
-; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm4[2,1],ymm12[2,0],ymm4[6,5],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm9[0],mem[0],ymm9[2],mem[2]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 1632(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 416(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm10[1,3],ymm0[4,5],ymm10[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 864(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm2[1,3],ymm0[4,5],ymm2[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[2,1],ymm7[2,0],ymm0[6,5],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT:    vblendps $8, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm7[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm2[3,1],ymm8[0,2],ymm2[7,5],ymm8[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1312(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm13[2,1],ymm7[2,0],ymm13[6,5],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm7[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3]
+; AVX-NEXT:    vmovaps 960(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm10[2,1],ymm7[2,0],ymm10[6,5],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm15[0],ymm4[2],ymm15[2]
+; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm7[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm8[0,2],ymm1[7,5],ymm8[4,6]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm13[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1760(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm11[1,3],ymm0[4,5],ymm11[5,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm9[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1536(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7]
-; AVX-NEXT:    vmovaps %ymm3, %ymm15
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[0,0],ymm1[5,4],ymm14[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm2[0,2],ymm14[7,5],ymm2[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm12[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 1088(%rdi), %ymm11
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm10[1,3],ymm11[4,5],ymm10[5,7]
-; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm5, %xmm7
+; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1408(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm9[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 640(%rdi), %ymm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,0],xmm2[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm5[1,3],ymm8[4,5],ymm5[5,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[2,1],ymm2[2,0],ymm5[6,5],ymm2[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[0,2],ymm4[2,0],ymm3[4,6],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm2[0,1,2],mem[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm6[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm13
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 288(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 416(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm10[1,3],ymm1[4,5],ymm10[5,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm7[2,0],ymm2[4,6],ymm7[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm4[0,0],ymm2[7,4],ymm4[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 736(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 768(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
-; AVX-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm8[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 864(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm6[1,3],ymm1[4,5],ymm6[5,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm5[2,0],ymm8[5,4],ymm5[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 544(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 1184(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 1216(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 512(%rdi), %xmm6
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm12[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1312(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[0,1],mem[1,3],ymm1[4,5],mem[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm1[2,0],ymm9[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1632(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 1664(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 768(%rdi), %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1760(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm11[1,3],ymm1[4,5],ymm11[5,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1408(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 1440(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 736(%rdi), %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm7[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1536(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm13[1,3],ymm1[4,5],ymm13[5,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
+; AVX-NEXT:    vmovaps 960(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 960(%rdi), %xmm3
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm12[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps %xmm14, %xmm12
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm14[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 1088(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm15[1,3],ymm1[4,5],ymm15[5,7]
+; AVX-NEXT:    vmovaps %ymm15, %ymm11
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 512(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 544(%rdi), %xmm8
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
+; AVX-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 1216(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 1184(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm6[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 640(%rdi), %ymm13
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7]
+; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm2[2,0],ymm5[4,6],ymm2[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
+; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm4[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm7
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm0[1,3],ymm7[4,5],ymm0[5,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm3[0,2],ymm14[2,0],ymm3[4,6],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm3[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm14[0,0],ymm3[7,4],ymm14[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm14[2,0],ymm3[6,4],ymm14[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm9[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1,2],xmm14[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm4[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm10[2,0],ymm0[5,4],ymm10[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm9[2,0],ymm3[6,4],ymm9[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm10[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = mem[0,1,2],xmm9[3]
 ; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps 1440(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1]
-; AVX-NEXT:    vmovaps 1408(%rdi), %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[0,0],ymm5[7,4],ymm3[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
+; AVX-NEXT:    vblendps $7, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4]
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm10[2,0],ymm0[6,4],ymm10[6,4]
-; AVX-NEXT:    vmovaps 1664(%rdi), %xmm0
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1]
-; AVX-NEXT:    vmovaps 1632(%rdi), %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
 ; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = mem[0],xmm2[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm14[0,1,2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm12[1,0],ymm2[4,4],ymm12[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[2,1],ymm7[3,3],ymm13[6,5],ymm7[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[0,1,2],xmm7[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm13[1,0],ymm10[4,4],ymm13[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm7 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[0],xmm8[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4]
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[0,1,2],xmm6[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm11[2,0],ymm7[5,4],ymm11[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[0],xmm7[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm15[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm1[1,0],ymm12[2,0],ymm1[5,4],ymm12[6,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[0,1,0,1]
 ; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm4 = mem[0,1,2],xmm4[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm7 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm7[2,0],xmm4[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,2],xmm3[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0],xmm0[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm0[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm9[1,0],ymm0[4,4],ymm9[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm11[0,1,2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm10[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[0,0],ymm13[1,0],ymm11[4,4],ymm13[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -10829,278 +10750,347 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
-; AVX-NEXT:    vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[0,1,2],xmm1[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[2,1],ymm15[3,3],ymm5[6,5],ymm15[7,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0],xmm3[1],mem[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0,1,2],mem[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = xmm0[0,1,2],mem[3]
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm3[0,1,2],mem[3]
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm8 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm5[3,0],ymm8[0,0],ymm5[7,4],ymm8[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3]
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm7[0,1,2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3]
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[2,1],ymm14[3,3],ymm6[6,5],ymm14[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm15[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm15[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[2,1],ymm12[3,3],ymm1[6,5],ymm12[7,7]
+; AVX-NEXT:    vmovaps %ymm1, %ymm12
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[1,0],ymm3[2,0],ymm14[5,4],ymm3[6,4]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm2[0,1,2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm5[3,0],ymm9[0,0],ymm5[7,4],ymm9[4,4]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm4[0,0],ymm5[1,0],ymm4[4,4],ymm5[5,4]
+; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm14[3,2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm10[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm10[2,0],xmm2[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm0[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[0,0],ymm0[7,4],ymm10[4,4]
 ; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm10 = xmm10[0],mem[1],xmm10[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm9[0,1,2],mem[3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
-; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[2,0],xmm5[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4]
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm2[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[1,0],ymm13[2,0],ymm11[5,4],ymm13[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm10[2,0],xmm2[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm10[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm10 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
 ; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm10 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm10 = xmm10[0],mem[1],xmm10[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[0,1,2],mem[3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
 ; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm6[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm10[0,0],ymm6[7,4],ymm10[4,4]
-; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[2,3,2,3]
-; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm12[0],mem[1],xmm12[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,0],ymm12[4,5],ymm10[6,4]
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm10, 160(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm10, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm10, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 160(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 96(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%r9)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%r9)
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 224(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 192(%rax)
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm7[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 160(%rax)
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm10[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 128(%rax)
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm6[3,0],ymm11[0,0],ymm6[7,4],ymm11[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,0],ymm11[4,5],ymm7[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[0,1,2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 96(%rax)
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm10[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 64(%rax)
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm6[3,0],ymm11[0,0],ymm6[7,4],ymm11[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm14[0],mem[1],xmm14[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,0],ymm14[4,5],ymm11[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%rax)
+; AVX-NEXT:    vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm14 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%rax)
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[3,0],ymm14[0,0],ymm6[7,4],ymm14[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm14[0],mem[1],xmm14[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,0],ymm14[4,5],ymm13[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
+; AVX-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = xmm10[0,1,2],mem[3]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
+; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm12[2,3,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm14[0,0],ymm12[7,4],ymm14[4,4]
+; AVX-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = mem[2,3,2,3]
+; AVX-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
+; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,0],ymm9[4,5],ymm14[6,4]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 192(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 128(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 224(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm9, 160(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm9, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm9, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 192(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 128(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 64(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 224(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 160(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 96(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 192(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 128(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 64(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 224(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 160(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 96(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, (%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 64(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 128(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 192(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 224(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 160(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 96(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 224(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 192(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 160(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 128(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 96(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 64(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, 32(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, (%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm9, 224(%rax)
-; AVX-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX-NEXT:    vmovaps %ymm4, 160(%rax)
-; AVX-NEXT:    vmovaps %ymm8, 128(%rax)
+; AVX-NEXT:    vmovaps %ymm3, 224(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 192(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 160(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 128(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm3, 96(%rax)
-; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
-; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm2, (%rax)
-; AVX-NEXT:    addq $3176, %rsp # imm = 0xC68
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rax)
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    vmovaps %ymm14, 224(%rax)
+; AVX-NEXT:    vmovaps %ymm13, 192(%rax)
+; AVX-NEXT:    vmovaps %ymm11, 160(%rax)
+; AVX-NEXT:    vmovaps %ymm7, 128(%rax)
+; AVX-NEXT:    vmovaps %ymm8, 96(%rax)
+; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX-NEXT:    vmovaps %ymm2, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm0, (%rax)
+; AVX-NEXT:    addq $3224, %rsp # imm = 0xC98
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 93a84e30412d6..0e3dab6580318 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -84,105 +84,113 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i64_stride3_vf2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX512-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    vmovaps 16(%rdi), %xmm2
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512-NEXT:    vmovaps %xmm2, (%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm1
-; AVX512-FCP-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512-FCP-NEXT:    vpermpd %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
 ; AVX512-FCP-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX512-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512-FCP-NEXT:    vmovaps %xmm2, (%rcx)
+; AVX512-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT:    vmovaps %xmm0, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512DQ-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX512DQ-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vmovaps 16(%rdi), %xmm2
+; AVX512DQ-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512DQ-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vmovaps %xmm2, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512DQ-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-FCP-NEXT:    vpermpd %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride3_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512BW-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512BW-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovaps 16(%rdi), %xmm2
+; AVX512BW-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512BW-NEXT:    vmovaps %xmm2, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512BW-FCP-NEXT:    vpermpd %zmm0, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512BW-FCP-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-BW-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-NEXT:    vmovaps 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT:    vmovaps %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovaps %xmm2, (%rcx)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-BW-FCP-NEXT:    vpermpd %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <6 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 0c7c3f4b16646..e6fff38ade14b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -103,18 +103,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i64_stride4_vf2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX512-NEXT:    vmovaps (%rdi), %ymm3
-; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX512-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT:    vmovaps (%rdi), %xmm1
+; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512-NEXT:    vmovaps (%rdi), %ymm2
+; AVX512-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512-NEXT:    vmovaps %xmm1, (%rdx)
 ; AVX512-NEXT:    vextractf128 $1, %ymm4, (%rcx)
-; AVX512-NEXT:    vextractf128 $1, %ymm1, (%r8)
+; AVX512-NEXT:    vextractf128 $1, %ymm2, (%r8)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -137,18 +137,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-LABEL: load_i64_stride4_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm3
-; AVX512DQ-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512DQ-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm1
+; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX512DQ-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512DQ-NEXT:    vmovaps %xmm1, (%rdx)
 ; AVX512DQ-NEXT:    vextractf128 $1, %ymm4, (%rcx)
-; AVX512DQ-NEXT:    vextractf128 $1, %ymm1, (%r8)
+; AVX512DQ-NEXT:    vextractf128 $1, %ymm2, (%r8)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -171,18 +171,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-LABEL: load_i64_stride4_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX512BW-NEXT:    vmovaps (%rdi), %ymm3
-; AVX512BW-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512BW-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovaps (%rdi), %xmm1
+; AVX512BW-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512BW-NEXT:    vmovaps (%rdi), %ymm2
+; AVX512BW-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX512BW-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512BW-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT:    vmovaps %xmm1, (%rdx)
 ; AVX512BW-NEXT:    vextractf128 $1, %ymm4, (%rcx)
-; AVX512BW-NEXT:    vextractf128 $1, %ymm1, (%r8)
+; AVX512BW-NEXT:    vextractf128 $1, %ymm2, (%r8)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -205,18 +205,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride4_vf2:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %ymm3
-; AVX512DQ-BW-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-BW-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512DQ-BW-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %ymm2
+; AVX512DQ-BW-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX512DQ-BW-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovaps %xmm1, (%rdx)
 ; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm4, (%rcx)
-; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm1, (%r8)
+; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm2, (%r8)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 07988a416bac4..3abf5bbaea27b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -114,154 +114,178 @@ define void @load_i64_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i64_stride5_vf2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512-NEXT:    vpermq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdx)
 ; AVX512-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i64_stride5_vf2:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovaps (%rdi), %xmm2
+; AVX512-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512-FCP-NEXT:    vpermq %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vmovaps %xmm2, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i64_stride5_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512DQ-NEXT:    vpermq %zmm1, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512DQ-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512DQ-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512DQ-FCP-NEXT:    vpermq %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512DQ-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride5_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512BW-NEXT:    vpermq %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512BW-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512BW-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512BW-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride5_vf2:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride5_vf2:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512DQ-BW-NEXT:    vpermq %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rdx)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <10 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <10 x i64> %wide.vec, <10 x i64> poison, <2 x i32> <i32 0, i32 5>
@@ -501,24 +525,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -561,24 +585,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm4, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -621,24 +645,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX512BW-FCP-NEXT:    vpbroadcastq %xmm4, %ymm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -681,24 +705,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq %xmm4, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 144(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 7d3209397c3df..55be6ea0c5722 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -587,7 +587,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -614,27 +614,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512-FCP-NEXT:    vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm5, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
@@ -669,7 +668,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -696,27 +695,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512DQ-FCP-NEXT:    vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
@@ -751,7 +749,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -778,27 +776,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512BW-FCP-NEXT:    vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512BW-FCP-NEXT:    vpbroadcastq %xmm5, %ymm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512BW-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
@@ -833,7 +830,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -860,27 +857,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq %xmm5, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 136(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [5,11]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index cc3e5f3d1d82e..0d4e1a831cd71 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -48,26 +48,25 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT:    vmovapd 16(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps 48(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0,1],xmm1[2,3]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm0[0],mem[1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = mem[0,1,2,3],xmm3[4,5,6,7]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; AVX-NEXT:    vmovaps %xmm4, (%rsi)
 ; AVX-NEXT:    vmovdqa %xmm2, (%rdx)
-; AVX-NEXT:    vmovapd %xmm5, (%rcx)
-; AVX-NEXT:    vmovapd %xmm0, (%r8)
-; AVX-NEXT:    vmovapd %xmm3, (%r9)
+; AVX-NEXT:    vmovdqa %xmm5, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, (%r8)
+; AVX-NEXT:    vmovdqa %xmm3, (%r9)
 ; AVX-NEXT:    vmovdqa %xmm6, (%r10)
 ; AVX-NEXT:    vmovaps %xmm1, (%rax)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: load_i64_stride7_vf2:
@@ -155,216 +154,236 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512-NEXT:    vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512-NEXT:    vmovdqa %xmm2, (%r8)
 ; AVX512-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512-NEXT:    vmovdqa %xmm3, (%r10)
+; AVX512-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i64_stride7_vf2:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512-FCP-NEXT:    vpermq %zmm1, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i64_stride7_vf2:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-NEXT:    vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa %xmm2, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512DQ-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512DQ-NEXT:    vmovdqa %xmm3, (%r10)
+; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-FCP-NEXT:    vpermq %zmm1, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride7_vf2:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
 ; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
 ; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512BW-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512BW-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512BW-NEXT:    vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512BW-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512BW-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512BW-NEXT:    vmovdqa %xmm2, (%r8)
 ; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512BW-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512BW-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512BW-NEXT:    vmovdqa %xmm3, (%r10)
+; AVX512BW-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf2:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512BW-FCP-NEXT:    vpermq %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf2:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-BW-NEXT:    vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%r8)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%r10)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <14 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 0, i32 7>
@@ -448,7 +467,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 16(%rdi), %xmm7
 ; AVX-NEXT:    vmovapd 48(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm9
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm9
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3]
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
@@ -459,14 +478,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = mem[0,1,2,3],xmm9[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3]
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm9
@@ -478,7 +497,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd %ymm6, (%rsi)
 ; AVX-NEXT:    vmovapd %ymm5, (%rdx)
 ; AVX-NEXT:    vmovapd %ymm4, (%rcx)
-; AVX-NEXT:    vmovapd %ymm7, (%r8)
+; AVX-NEXT:    vmovaps %ymm7, (%r8)
 ; AVX-NEXT:    vmovapd %ymm8, (%r9)
 ; AVX-NEXT:    vmovapd %ymm2, (%r10)
 ; AVX-NEXT:    vmovapd %ymm0, (%rax)
@@ -698,51 +717,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512-FCP-NEXT:    vpermq %zmm3, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512-FCP-NEXT:    vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512-FCP-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r10)
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -803,51 +822,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512DQ-FCP-NEXT:    vpermq %zmm3, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r10)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -908,51 +927,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512BW-FCP-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%r10)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -1013,51 +1032,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 160(%rdi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq 176(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <28 x i64>, ptr %in.vec, align 64
@@ -1214,7 +1233,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 48(%rdi), %xmm1
 ; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm12
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm10
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm10
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1243,28 +1262,28 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 240(%rdi), %xmm14
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3]
-; AVX-NEXT:    vmovapd 304(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[3]
-; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT:    vmovdqa 416(%rdi), %xmm13
+; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3]
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm14
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm15
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = mem[0,1,2,3],xmm10[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
 ; AVX-NEXT:    vmovapd 352(%rdi), %ymm14
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 256(%rdi), %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3]
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm13
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm5
@@ -1273,8 +1292,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm15
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[3],ymm15[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3]
 ; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm13 = mem[0,1,2],ymm13[3]
 ; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -1291,12 +1310,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd %ymm6, (%rdx)
 ; AVX-NEXT:    vmovapd %ymm9, 32(%rcx)
 ; AVX-NEXT:    vmovapd %ymm8, (%rcx)
-; AVX-NEXT:    vmovapd %ymm12, 32(%r8)
-; AVX-NEXT:    vmovapd %ymm11, (%r8)
-; AVX-NEXT:    vmovapd %ymm2, 32(%r9)
+; AVX-NEXT:    vmovaps %ymm12, 32(%r8)
+; AVX-NEXT:    vmovaps %ymm11, (%r8)
+; AVX-NEXT:    vmovapd %ymm1, 32(%r9)
 ; AVX-NEXT:    vmovapd %ymm10, (%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovapd %ymm1, 32(%rax)
+; AVX-NEXT:    vmovapd %ymm2, 32(%rax)
 ; AVX-NEXT:    vmovapd %ymm5, (%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovapd %ymm0, 32(%rax)
@@ -1730,106 +1749,107 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [2,5]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm9
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [4,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [5,12]
-; AVX512-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm9, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vpermi2q %ymm11, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [5,12]
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm11 = [6,13]
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm10[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-FCP-NEXT:    movb $24, %r10b
-; AVX512-FCP-NEXT:    kmovw %r10d, %k2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm13
+; AVX512-FCP-NEXT:    movb $24, %dil
+; AVX512-FCP-NEXT:    kmovw %edi, %k2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm13[4,5,4,5],zmm10[4,5,4,5]
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm14
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
 ; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm15
-; AVX512-FCP-NEXT:    movb $-32, %r10b
-; AVX512-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm14, %zmm15
+; AVX512-FCP-NEXT:    movb $-32, %dil
+; AVX512-FCP-NEXT:    kmovw %edi, %k1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [14,0,0,7,14,0,0,7]
 ; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm15, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm13, %zmm15
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm13, %zmm10, %zmm16
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm12 = [0,0,4,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm13, %zmm12
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm14, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm15, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm14
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm14, %zmm10
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm16, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm4
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
 ; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rdi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -1948,106 +1968,107 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [2,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [4,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [5,12]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm11, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [5,12]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm11 = [6,13]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm10[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-FCP-NEXT:    movb $24, %r10b
-; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k2
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    movb $24, %dil
+; AVX512DQ-FCP-NEXT:    kmovw %edi, %k2
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm13[4,5,4,5],zmm10[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm14
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
 ; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm15
-; AVX512DQ-FCP-NEXT:    movb $-32, %r10b
-; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm14, %zmm15
+; AVX512DQ-FCP-NEXT:    movb $-32, %dil
+; AVX512DQ-FCP-NEXT:    kmovw %edi, %k1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm15, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm13, %zmm15
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm13, %zmm10, %zmm16
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm12 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm15, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm14
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm14, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm16, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm4
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
 ; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rdi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -2166,105 +2187,106 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [4,11]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [6,13]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
 ; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [4,11]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [5,12]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm6, %zmm5
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
 ; AVX512BW-FCP-NEXT:    movb $24, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm9, %zmm14
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm12[4,5,4,5],zmm11[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm13
 ; AVX512BW-FCP-NEXT:    movb $-32, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm7 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [14,0,0,7,14,0,0,7]
 ; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm9, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm14, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
-; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm11, %zmm12, %zmm14
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm12, %zmm11, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,4,11]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm11, %zmm12, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm13, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,11,0,5,6,11]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [2,5]
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm16
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm16, %ymm12, %ymm10
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm14, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm15, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm11
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,9,2,9,2,9,2,9]
+; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%r9)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
@@ -2384,105 +2406,106 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm14, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm6, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    movb $24, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm9, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm12[4,5,4,5],zmm11[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    movb $-32, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm9, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm14, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm11, %zmm12, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm12, %zmm11, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm6 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm11, %zmm12, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm13, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm16, %ymm12, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm14, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm15, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
@@ -2828,179 +2851,179 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm2
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 608(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
-; AVX-NEXT:    vmovdqa 512(%rdi), %xmm9
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 608(%rdi), %ymm6
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
+; AVX-NEXT:    vmovdqa 512(%rdi), %xmm8
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 352(%rdi), %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3]
-; AVX-NEXT:    vmovapd 240(%rdi), %xmm3
-; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 800(%rdi), %xmm7
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
-; AVX-NEXT:    vmovapd 688(%rdi), %xmm5
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 352(%rdi), %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3]
+; AVX-NEXT:    vmovapd 240(%rdi), %xmm1
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 800(%rdi), %xmm5
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm7
+; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3]
+; AVX-NEXT:    vmovapd 688(%rdi), %xmm7
+; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = xmm7[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm10
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 576(%rdi), %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm9
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3]
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 576(%rdi), %xmm10
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm2
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3]
 ; AVX-NEXT:    vmovdqa 464(%rdi), %xmm6
-; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 752(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3]
-; AVX-NEXT:    vmovdqa 864(%rdi), %xmm5
-; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vmovapd 304(%rdi), %xmm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
-; AVX-NEXT:    vmovdqa 416(%rdi), %xmm8
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3]
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 528(%rdi), %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3]
-; AVX-NEXT:    vmovdqa 640(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm2
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm6
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovapd 256(%rdi), %xmm8
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm2
-; AVX-NEXT:    vmovapd 800(%rdi), %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovapd 704(%rdi), %xmm5
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 864(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm5
+; AVX-NEXT:    vmovdqa 752(%rdi), %xmm12
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 416(%rdi), %xmm5
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovapd 576(%rdi), %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 480(%rdi), %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm0
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm10
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovapd 416(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2]
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm7
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm11
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm4
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm9[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vmovdqa 640(%rdi), %xmm4
+; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vmovdqa 528(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm6
+; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3]
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm8
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vmovapd 800(%rdi), %ymm2
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovdqa 704(%rdi), %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm7[0,1],ymm3[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm3
+; AVX-NEXT:    vmovapd 576(%rdi), %ymm12
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovdqa 480(%rdi), %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm0[0,1],ymm3[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm11
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = mem[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm3[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovapd 416(%rdi), %ymm4
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovapd 864(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovapd 864(%rdi), %ymm3
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovdqa 544(%rdi), %xmm3
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 640(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2]
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm15
-; AVX-NEXT:    vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = mem[0,1,2],ymm2[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm5[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovdqa 544(%rdi), %xmm2
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 640(%rdi), %ymm5
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm12[0],ymm5[1],ymm12[3],ymm5[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd 192(%rdi), %ymm2
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm11[0],ymm2[1],ymm11[3],ymm2[2]
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
+; AVX-NEXT:    vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3]
 ; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm4 = mem[0,1,2],ymm4[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = mem[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
 ; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm3 = mem[0,1,2],ymm3[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 64(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 96(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 32(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 64(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, (%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 96(%rcx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 32(%rcx)
-; AVX-NEXT:    vmovapd %ymm13, 64(%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, (%r8)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 32(%r8)
-; AVX-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm4, 96(%r8)
-; AVX-NEXT:    vmovapd %ymm11, (%r9)
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = mem[0,1,2],ymm2[3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = mem[0,1,2],ymm5[3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 64(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, (%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 96(%rcx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 32(%rcx)
+; AVX-NEXT:    vmovaps %ymm14, 64(%r8)
+; AVX-NEXT:    vmovaps %ymm15, (%r8)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 32(%r8)
+; AVX-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 96(%r8)
+; AVX-NEXT:    vmovapd %ymm13, (%r9)
 ; AVX-NEXT:    vmovapd %ymm9, 64(%r9)
-; AVX-NEXT:    vmovapd %ymm12, 96(%r9)
-; AVX-NEXT:    vmovapd %ymm14, 32(%r9)
+; AVX-NEXT:    vmovapd %ymm10, 96(%r9)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm5, 32(%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovapd %ymm10, (%rax)
-; AVX-NEXT:    vmovapd %ymm7, 64(%rax)
-; AVX-NEXT:    vmovapd %ymm5, 96(%rax)
+; AVX-NEXT:    vmovapd %ymm1, (%rax)
+; AVX-NEXT:    vmovapd %ymm12, 64(%rax)
+; AVX-NEXT:    vmovapd %ymm7, 96(%rax)
 ; AVX-NEXT:    vmovapd %ymm8, 32(%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovapd %ymm3, 64(%rax)
-; AVX-NEXT:    vmovapd %ymm1, (%rax)
+; AVX-NEXT:    vmovapd %ymm2, (%rax)
 ; AVX-NEXT:    vmovapd %ymm0, 96(%rax)
-; AVX-NEXT:    vmovapd %ymm2, 32(%rax)
+; AVX-NEXT:    vmovapd %ymm4, 32(%rax)
 ; AVX-NEXT:    addq $552, %rsp # imm = 0x228
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -3892,153 +3915,158 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm30
+; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm26
-; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm27
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
 ; AVX512-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm18, %zmm24
 ; AVX512-FCP-NEXT:    movb $24, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm14
 ; AVX512-FCP-NEXT:    movb $-32, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm18, %zmm14
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm17, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm24, %zmm17
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm22, %zmm5
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm16
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm20, %zmm17
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm22
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm21, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm19, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
 ; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm23, %zmm22
-; AVX512-FCP-NEXT:    vpermi2q %zmm26, %zmm3, %zmm25
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm26, %zmm20
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpermt2q %ymm11, %ymm27, %ymm4
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm20, %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm23
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm22
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm22, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm23, %zmm24
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm26, %zmm25
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm3, %zmm26
 ; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpermi2q %ymm4, %ymm11, %ymm27
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm25
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm26, %zmm25
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm27, %zmm23
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm23, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm23 {%k1}
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm22
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm26, %zmm22
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm0, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm25, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm27, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm12[6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm4, %zmm24
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm5, %zmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm3, %zmm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm4, %zmm22
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm18
 ; AVX512-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpermi2q %zmm27, %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512-FCP-NEXT:    vpermi2q %zmm28, %zmm7, %zmm3
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [5,12]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vpermi2q %zmm27, %zmm2, %zmm12
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm6, %zmm19
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpermi2q %zmm28, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm19, %zmm5
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
 ; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm8
@@ -4046,28 +4074,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm11, %zmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm9
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm12, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%r10)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -4266,153 +4294,158 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm18, %zmm24
 ; AVX512DQ-FCP-NEXT:    movb $24, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm14
 ; AVX512DQ-FCP-NEXT:    movb $-32, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm18, %zmm14
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm17, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm24, %zmm17
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm22, %zmm5
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm16
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm20, %zmm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm21, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm19, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
 ; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm23, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm26, %zmm3, %zmm25
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm26, %zmm20
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm11, %ymm27, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm20, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm22, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm23, %zmm24
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm3, %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm4, %ymm11, %ymm27
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm25
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm27, %zmm23
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm23, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm26, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm0, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm25, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm27, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm12[6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm4, %zmm24
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm5, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm3, %zmm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm4, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm2, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm18
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm27, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm28, %zmm7, %zmm3
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [5,12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm27, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm6, %zmm19
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm28, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm19, %zmm5
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
 ; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm8
@@ -4420,28 +4453,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm11, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -4640,186 +4673,189 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm26
+; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm24
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm18, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm18, %zmm3
 ; AVX512BW-FCP-NEXT:    movb $24, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm9[4,5,4,5],zmm24[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm14
 ; AVX512BW-FCP-NEXT:    movb $-32, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm14
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm24, %zmm17
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm18, %zmm14
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm22, %zmm5
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm19, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm20, %zmm17
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm22
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm17
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm21, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm19, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm20, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
 ; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm25, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm22
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm26, %zmm4, %zmm25
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm26, %zmm20
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm29
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm30 = ymm28[8,9,10,11,12,13,14,15],ymm29[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm29[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm29, %ymm27, %ymm28
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm28, %zmm20, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm23
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm27, %xmm27
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm24
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm26, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm5
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm26
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm25
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm28
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm25, %ymm28, %ymm27
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm26
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm26
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm27, %zmm26
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm30, %xmm30
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm23 {%k1}
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm27, %zmm22
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm25 = ymm28[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm25, %xmm25
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm25, %zmm29, %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm22
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm22
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm26, %zmm22
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm10
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,11]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm10
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm24
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm25, %zmm18
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm18
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm28, %zmm2, %zmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm10
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm12, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm18
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm12, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm10
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm28, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm7, %zmm11, %zmm0
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm27
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm18, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm6, %zmm19
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm28, %zmm11, %zmm0
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
 ; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm8
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
-; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm9
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,8,15,4,5,8,15]
+; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm10 = [0,0,4,11]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm10, %zmm9
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [6,13]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm13, %zmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm7, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm13, %zmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r10)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%r10)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, 64(%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -5014,186 +5050,189 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm18, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm18, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    movb $24, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm9[4,5,4,5],zmm24[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    movb $-32, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm18, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm24, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm18, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm22, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm19, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm20, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm21, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm19, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm20, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm25, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm26, %zmm4, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm26, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm30 = ymm28[8,9,10,11,12,13,14,15],ymm29[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm29[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm29, %ymm27, %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm28, %zmm20, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm27, %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm26, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm25, %ymm28, %ymm27
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm22, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm27, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm30, %xmm30
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm30, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm23 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm27, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm25 = ymm28[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm25, %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm25, %zmm29, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm18, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm26, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm25, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm6, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm28, %zmm2, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm12, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm12, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm28, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm7, %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm18, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm6, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm28, %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm5, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm10 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm10, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm13, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm7, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm13, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <112 x i64>, ptr %in.vec, align 64
@@ -5776,411 +5815,412 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    subq $1736, %rsp # imm = 0x6C8
 ; AVX-NEXT:    vmovaps 1216(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 768(%rdi), %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm7
+; AVX-NEXT:    vmovaps 768(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX-NEXT:    vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm0[6,7]
 ; AVX-NEXT:    vmovaps 224(%rdi), %xmm5
-; AVX-NEXT:    vmovaps 272(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 272(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
 ; AVX-NEXT:    vmovaps 672(%rdi), %xmm6
-; AVX-NEXT:    vmovaps 720(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 720(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX-NEXT:    vmovaps 1120(%rdi), %xmm11
-; AVX-NEXT:    vmovaps 1168(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 1168(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1664(%rdi), %ymm14
+; AVX-NEXT:    vmovaps 1664(%rdi), %ymm3
 ; AVX-NEXT:    vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm0[6,7]
 ; AVX-NEXT:    vmovaps 1568(%rdi), %xmm8
-; AVX-NEXT:    vmovaps 1616(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 1616(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 96(%rdi), %ymm4
 ; AVX-NEXT:    vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 48(%rdi), %xmm1
-; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 48(%rdi), %xmm0
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = mem[0],xmm0[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 544(%rdi), %ymm9
 ; AVX-NEXT:    vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 448(%rdi), %xmm10
-; AVX-NEXT:    vmovapd 496(%rdi), %xmm1
-; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 448(%rdi), %xmm13
+; AVX-NEXT:    vmovapd 496(%rdi), %xmm0
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm13[0],xmm0[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 992(%rdi), %ymm15
 ; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3]
 ; AVX-NEXT:    vmovapd 896(%rdi), %xmm12
-; AVX-NEXT:    vmovapd 944(%rdi), %xmm1
-; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovapd 944(%rdi), %xmm0
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm12[0],xmm0[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1440(%rdi), %ymm2
+; AVX-NEXT:    vmovapd 1440(%rdi), %ymm14
 ; AVX-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 1344(%rdi), %xmm3
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm14[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 1344(%rdi), %xmm2
 ; AVX-NEXT:    vmovapd 1392(%rdi), %xmm0
 ; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 384(%rdi), %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 736(%rdi), %xmm5
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 384(%rdi), %ymm5
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm10[0],ymm5[1],ymm10[3],ymm5[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 736(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1280(%rdi), %ymm6
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1728(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2]
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 608(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1280(%rdi), %ymm1
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1728(%rdi), %ymm0
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[3],ymm0[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm10
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[0],ymm10[1],ymm4[3],ymm10[2]
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm11
+; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 608(%rdi), %ymm3
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm9[0],ymm3[1],ymm9[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm4
-; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1056(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
-; AVX-NEXT:    vmovdqa 960(%rdi), %xmm15
-; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1504(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm13[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 1056(%rdi), %ymm8
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm15[0],ymm8[1],ymm15[3],ymm8[2]
+; AVX-NEXT:    vmovdqa 960(%rdi), %xmm13
+; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3]
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 1504(%rdi), %ymm12
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm9
-; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 352(%rdi), %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 240(%rdi), %xmm2
-; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 800(%rdi), %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm10
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3]
-; AVX-NEXT:    vmovapd 688(%rdi), %xmm10
-; AVX-NEXT:    vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1248(%rdi), %xmm7
-; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm12
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3]
-; AVX-NEXT:    vmovapd 1136(%rdi), %xmm6
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1696(%rdi), %xmm7
-; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3]
-; AVX-NEXT:    vmovapd 1584(%rdi), %xmm11
-; AVX-NEXT:    vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3]
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm8
-; AVX-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 576(%rdi), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3]
-; AVX-NEXT:    vmovdqa 464(%rdi), %xmm8
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3]
-; AVX-NEXT:    vmovdqa 912(%rdi), %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1472(%rdi), %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3]
-; AVX-NEXT:    vmovdqa 1360(%rdi), %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 304(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3]
-; AVX-NEXT:    vmovdqa 416(%rdi), %xmm13
-; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 752(%rdi), %xmm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3]
-; AVX-NEXT:    vmovdqa 864(%rdi), %xmm10
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1200(%rdi), %xmm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm6
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps 352(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1648(%rdi), %xmm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3]
+; AVX-NEXT:    vmovapd 240(%rdi), %xmm5
+; AVX-NEXT:    vblendpd {{.*#+}} xmm6 = xmm5[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 800(%rdi), %xmm2
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3]
+; AVX-NEXT:    vmovapd 688(%rdi), %xmm7
+; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = xmm7[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1248(%rdi), %xmm9
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm2
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
+; AVX-NEXT:    vmovapd 1136(%rdi), %xmm14
+; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm14[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1696(%rdi), %xmm15
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm1
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 1584(%rdi), %xmm2
+; AVX-NEXT:    vblendpd {{.*#+}} xmm1 = xmm2[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
+; AVX-NEXT:    vmovaps 128(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovdqa 576(%rdi), %xmm10
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovdqa 464(%rdi), %xmm11
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm4[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 976(%rdi), %xmm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
-; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm6
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3]
+; AVX-NEXT:    vmovdqa 912(%rdi), %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 528(%rdi), %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 640(%rdi), %xmm4
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm3
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3]
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 1472(%rdi), %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm4
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3]
+; AVX-NEXT:    vmovapd 1360(%rdi), %xmm4
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = xmm4[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3]
+; AVX-NEXT:    vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 416(%rdi), %xmm13
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 864(%rdi), %xmm8
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT:    vmovdqa 752(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm5
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3]
+; AVX-NEXT:    vmovdqa 1200(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm7
+; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm15[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT:    vmovdqa 1648(%rdi), %xmm9
+; AVX-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm9
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa 1424(%rdi), %xmm7
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa 976(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 640(%rdi), %xmm2
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vmovdqa 528(%rdi), %xmm4
+; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm3
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm14
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm11
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = mem[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm3
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3]
-; AVX-NEXT:    vmovapd 256(%rdi), %xmm3
-; AVX-NEXT:    vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vmovapd 576(%rdi), %ymm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3]
-; AVX-NEXT:    vmovapd 480(%rdi), %xmm7
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm10
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm6
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vmovaps 576(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vmovdqa 480(%rdi), %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm3
+; AVX-NEXT:    vmovapd 800(%rdi), %ymm8
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovapd 704(%rdi), %xmm15
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm15[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps 1024(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovdqa 928(%rdi), %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 1248(%rdi), %ymm14
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 1152(%rdi), %xmm6
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm6[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm4
-; AVX-NEXT:    vmovapd 800(%rdi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3]
-; AVX-NEXT:    vmovapd 704(%rdi), %xmm14
-; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vmovapd 1024(%rdi), %ymm4
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 928(%rdi), %xmm9
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps 1472(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovdqa 1376(%rdi), %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
+; AVX-NEXT:    vmovapd 1696(%rdi), %ymm5
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
+; AVX-NEXT:    vmovapd 1600(%rdi), %xmm7
+; AVX-NEXT:    vblendpd $2, (%rsp), %xmm7, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm7[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm4
-; AVX-NEXT:    vmovapd 1248(%rdi), %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3]
-; AVX-NEXT:    vmovapd 1152(%rdi), %xmm15
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm15[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm4
-; AVX-NEXT:    vmovaps 1472(%rdi), %ymm5
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT:    vmovaps 1376(%rdi), %xmm8
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm8[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 1696(%rdi), %ymm12
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3]
-; AVX-NEXT:    vmovapd 1600(%rdi), %xmm13
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm13[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 192(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 416(%rdi), %ymm10
+; AVX-NEXT:    vmovapd 192(%rdi), %ymm9
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[3],ymm9[2]
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vmovapd 416(%rdi), %ymm11
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[3],ymm11[2]
+; AVX-NEXT:    vmovdqa 320(%rdi), %xmm12
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 544(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 640(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
-; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
-; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 544(%rdi), %xmm2
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 640(%rdi), %ymm5
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 864(%rdi), %ymm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
-; AVX-NEXT:    vmovdqa 768(%rdi), %xmm4
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 992(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[3],ymm4[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 864(%rdi), %ymm10
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
+; AVX-NEXT:    vmovdqa 768(%rdi), %xmm8
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 992(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm3
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1312(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
-; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm2
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovdqa 1440(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1536(%rdi), %ymm8
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
-; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm11 = mem[0,1,2],ymm11[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm10 = mem[0,1,2],ymm10[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = mem[0,1,2],ymm5[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = mem[0,1,2],ymm7[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 1312(%rdi), %ymm2
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm14[0],ymm2[1],ymm14[3],ymm2[2]
+; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm14
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovdqa 1440(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1536(%rdi), %ymm1
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3]
+; AVX-NEXT:    vmovapd 1760(%rdi), %ymm0
+; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
+; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm5
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = mem[0,1,2],ymm9[3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm15 = xmm15[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = mem[0,1,2],ymm11[3]
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm11[0,1],ymm9[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm4 = mem[0,1,2],ymm4[3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm9 = mem[0,1,2],ymm10[3]
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm8[0,1],ymm9[2,3]
 ; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm3 = mem[0,1,2],ymm3[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm4 = mem[0,1,2],ymm9[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3]
-; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = mem[0,1,2],ymm8[3]
-; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm8 = xmm8[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm8[0,1],ymm3[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm2 = mem[0,1,2],ymm2[3]
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
 ; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm1 = mem[0,1,2],ymm1[3]
-; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT:    vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = mem[0,1,2],ymm0[3]
+; AVX-NEXT:    vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 192(%rsi)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6262,29 +6302,28 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%r9)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovapd %ymm12, 224(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 192(%rax)
-; AVX-NEXT:    vmovapd %ymm15, 160(%rax)
+; AVX-NEXT:    vmovapd %ymm7, 224(%rax)
+; AVX-NEXT:    vmovapd %ymm6, 192(%rax)
+; AVX-NEXT:    vmovapd %ymm13, 160(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
-; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
+; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovapd %ymm10, 224(%rax)
-; AVX-NEXT:    vmovapd %ymm4, 192(%rax)
-; AVX-NEXT:    vmovapd %ymm7, 160(%rax)
-; AVX-NEXT:    vmovapd %ymm3, 128(%rax)
-; AVX-NEXT:    vmovapd %ymm5, 96(%rax)
-; AVX-NEXT:    vmovapd %ymm6, 64(%rax)
-; AVX-NEXT:    vmovapd %ymm11, 32(%rax)
-; AVX-NEXT:    vmovapd %ymm14, (%rax)
+; AVX-NEXT:    vmovapd %ymm9, 224(%rax)
+; AVX-NEXT:    vmovapd %ymm8, 192(%rax)
+; AVX-NEXT:    vmovapd %ymm2, 160(%rax)
+; AVX-NEXT:    vmovapd %ymm10, 128(%rax)
+; AVX-NEXT:    vmovapd %ymm11, 96(%rax)
+; AVX-NEXT:    vmovapd %ymm4, 64(%rax)
+; AVX-NEXT:    vmovapd %ymm12, 32(%rax)
+; AVX-NEXT:    vmovapd %ymm15, (%rax)
 ; AVX-NEXT:    addq $1736, %rsp # imm = 0x6C8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -8254,465 +8293,449 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $2728, %rsp # imm = 0xAA8
-; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm26
+; AVX512-FCP-NEXT:    subq $3048, %rsp # imm = 0xBE8
+; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 896(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpermi2q %ymm1, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm26, %zmm1
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm3, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm26, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm26, %zmm15
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm26, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm26, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm30, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm26, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm29
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm16
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm26
+; AVX512-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm9, %zmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm26, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm15, %zmm4
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm8, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm28
-; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm30, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm10
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm9, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm9, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm27
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm9, %zmm27
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm22, %zmm29
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm22
-; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm5, %zmm22
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm25
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm24
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm2, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm29
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm26
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm22
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm15, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm13
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm31, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm15, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm13
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm25
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm30
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm24
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm10
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm5, %zmm28
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm31, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm15, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm31, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm15, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm31, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm25 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm31, %zmm22
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm30 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm14
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm31
+; AVX512-FCP-NEXT:    vpermi2q %zmm29, %zmm8, %zmm15
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm29
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    movb $24, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm28
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm24
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm19
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm22
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm19
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm3, %zmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm15
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,8,15,4,5,8,15]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm29
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm21
+; AVX512-FCP-NEXT:    movb $24, %al
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm10[4,5,4,5],zmm27[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm16, %zmm23
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm9, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm12[4,5,4,5],zmm5[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm9, %zmm24
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vpermi2q %zmm11, %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm17
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm21, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm21, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 912(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 464(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm27
+; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm20
+; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm6, %zmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm15, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,11]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $0, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [6,13]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm30, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 960(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512-FCP-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm30, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm5
+; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm1
 ; AVX512-FCP-NEXT:    movb $-32, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k2}
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm17, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm20, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k2}
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm19, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k2}
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm6
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, 192(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm20, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm2, %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm12, %xmm16
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm12, %xmm17
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm24, %zmm16
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm9 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 192(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 128(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 64(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 128(%r8)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm4, 192(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 128(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 192(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r8)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 192(%r9)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm4, (%r9)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -8720,16 +8743,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm4, 128(%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovaps %zmm7, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512-FCP-NEXT:    vmovaps %zmm8, (%rax)
-; AVX512-FCP-NEXT:    vmovaps %zmm11, 64(%rax)
-; AVX512-FCP-NEXT:    addq $2728, %rsp # imm = 0xAA8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512-FCP-NEXT:    addq $3048, %rsp # imm = 0xBE8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -9216,465 +9243,449 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $2728, %rsp # imm = 0xAA8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT:    subq $3048, %rsp # imm = 0xBE8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm30, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm8, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm30, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 896(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm1, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm26, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm26, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm26, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm10
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm26, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm26, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm9, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm9, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm9, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm22, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm5, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm10, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm24
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm2, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm26
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm26, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm15, %zmm4
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm20
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm15, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm31, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm30
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm5, %zmm28
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm31, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm15, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm31, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm15, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm31, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm25 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm31, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm29, %zmm31
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm29, %zmm8, %zmm15
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm29
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movb $24, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm28
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm3, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm29
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm21
+; AVX512DQ-FCP-NEXT:    movb $24, %al
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm10[4,5,4,5],zmm27[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm16, %zmm23
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm9, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm12[4,5,4,5],zmm5[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm9, %zmm24
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm11, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm21, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 912(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm16, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 464(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm27
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm6, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm15, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,11]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [6,13]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm30, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 960(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm30, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm5
+; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm1
 ; AVX512DQ-FCP-NEXT:    movb $-32, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k2}
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm17, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k2}
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k2}
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 192(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm20, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm12, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm12, %xmm17
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm24, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 192(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 128(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 64(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 128(%r8)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm4, 192(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 128(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 192(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r8)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 192(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm4, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -9682,16 +9693,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm4, 128(%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm7, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm8, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm11, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $2728, %rsp # imm = 0xAA8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $3048, %rsp # imm = 0xBE8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -10170,474 +10185,481 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
-; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512BW-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm26
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm13, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
-; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm30
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm9, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm24
-; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm3
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm14, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm16
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm13, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm11
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm8, %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm9, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm13, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm22
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm9, %zmm22
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm29
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm31
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm31
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm25, %zmm15
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm21
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm16, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm28
+; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm13, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm18
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm8, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm27
+; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm30 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512BW-FCP-NEXT:    vpermt2q (%rsp), %zmm4, %zmm26 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm25
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm24, %zmm19
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm10
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm4, %zmm15
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm18, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm30
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    movb $24, %al
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm27
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm19
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm18
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm23
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm19
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm20
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm12, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 912(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
+; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm18
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm22
+; AVX512BW-FCP-NEXT:    movb $24, %al
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm27[4,5,4,5]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm11, %zmm21
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm29
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,0,4,11]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm8[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm11, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm24
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm24
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm12[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm27
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm12, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm23
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm12, %zmm16
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm12, %zmm7, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm18, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[4,5,4,5],zmm20[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm13
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm3, %zmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,11]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm28, %zmm9, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm11
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm30, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm30, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm30, %zmm9
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [6,13]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm28, %zmm9, %zmm10
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %ymm24
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm4, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $-32, %al
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm28 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 896(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [2,5]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm1, %ymm13, %ymm6
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm27, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm23, %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm10, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm12
-; AVX512BW-FCP-NEXT:    vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
-; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm16, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm6 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm1, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm21, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm15
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm21 = ymm9[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm15, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm17, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm15
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm14, %ymm15, %ymm13
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm11, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm23, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm29, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm13
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm24, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 192(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rdx)
+; AVX512BW-FCP-NEXT:    vmovups (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm14 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinsertf64x4 $0, %ymm14, %zmm10, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm22, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rsi)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 192(%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, 128(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 192(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%r8)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm1, 192(%r9)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm1, (%r9)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm1, 64(%r9)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm1, 128(%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, 64(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 192(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 64(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r8)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 128(%r9)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovaps %zmm8, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovaps %zmm9, (%rax)
-; AVX512BW-FCP-NEXT:    vmovaps %zmm11, 64(%rax)
-; AVX512BW-FCP-NEXT:    addq $2760, %rsp # imm = 0xAC8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-FCP-NEXT:    vmovaps %zmm14, (%rax)
+; AVX512BW-FCP-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512BW-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -11116,474 +11138,481 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm13, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm14, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm13, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm9, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm8, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm9, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm13, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm9, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm25, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm16, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm13, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm8, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q (%rsp), %zmm4, %zmm26 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm24, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm18, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm4, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    movb $24, %al
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm12, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 912(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    movb $24, %al
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm27[4,5,4,5]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm11, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm7 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm8[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm11, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm12[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm12, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm12, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm7, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm18, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm0, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[4,5,4,5],zmm20[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [4,11]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm3, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,11]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm28, %zmm9, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm30, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm30, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm30, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm28, %zmm9, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    movb $-32, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm28 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 896(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm1, %ymm13, %ymm6
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm27, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm23, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm10, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm16, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm6 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm1, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm21, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm21 = ymm9[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm15, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm17, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm14, %ymm15, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm11, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm23, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm29, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm24, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 192(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovups (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm14 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinsertf64x4 $0, %ymm14, %zmm10, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm15, %zmm22, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 192(%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 64(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, 128(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 192(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, 192(%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, 128(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, 64(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 128(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 192(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 128(%r9)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm8, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm9, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm11, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    addq $2760, %rsp # imm = 0xAC8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm14, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    addq $2632, %rsp # imm = 0xA48
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <224 x i64>, ptr %in.vec, align 64
@@ -12740,52 +12769,52 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i64_stride7_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $4232, %rsp # imm = 0x1088
-; AVX-NEXT:    vmovaps 1216(%rdi), %ymm3
-; AVX-NEXT:    vmovaps 768(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm5
+; AVX-NEXT:    subq $4248, %rsp # imm = 0x1098
+; AVX-NEXT:    vmovaps 1216(%rdi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 768(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm7
 ; AVX-NEXT:    vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm10
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 224(%rdi), %xmm12
 ; AVX-NEXT:    vmovaps 272(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps 672(%rdi), %xmm11
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 672(%rdi), %xmm13
 ; AVX-NEXT:    vmovaps 720(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps 1120(%rdi), %xmm12
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 1120(%rdi), %xmm14
 ; AVX-NEXT:    vmovaps 1168(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1664(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 1664(%rdi), %ymm8
 ; AVX-NEXT:    vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 1568(%rdi), %xmm15
+; AVX-NEXT:    vmovaps 1616(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 2112(%rdi), %ymm9
+; AVX-NEXT:    vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 1568(%rdi), %xmm2
-; AVX-NEXT:    vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovapd 1616(%rdi), %xmm1
-; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2112(%rdi), %ymm7
-; AVX-NEXT:    vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3]
 ; AVX-NEXT:    vmovapd 2016(%rdi), %xmm2
 ; AVX-NEXT:    vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovapd 2064(%rdi), %xmm1
@@ -12793,10 +12822,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2560(%rdi), %ymm8
+; AVX-NEXT:    vmovapd 2560(%rdi), %ymm10
 ; AVX-NEXT:    vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3]
 ; AVX-NEXT:    vmovapd 2464(%rdi), %xmm2
 ; AVX-NEXT:    vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovapd 2512(%rdi), %xmm1
@@ -12804,221 +12833,219 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 3008(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 3008(%rdi), %ymm11
 ; AVX-NEXT:    vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps 2912(%rdi), %xmm0
-; AVX-NEXT:    vmovaps 2960(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 3456(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 3360(%rdi), %xmm15
-; AVX-NEXT:    vmovaps 3408(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 160(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 48(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 544(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 608(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 448(%rdi), %xmm9
-; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 496(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 992(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 896(%rdi), %xmm9
-; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 944(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1440(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 1344(%rdi), %xmm9
-; AVX-NEXT:    vmovaps 1392(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 1888(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 1792(%rdi), %xmm13
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 1840(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 2336(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 2240(%rdi), %xmm13
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 2288(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 2784(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 2688(%rdi), %xmm13
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 2736(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 3232(%rdi), %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 3136(%rdi), %xmm13
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps 3184(%rdi), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 2912(%rdi), %xmm2
+; AVX-NEXT:    vmovaps 2960(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 3456(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 288(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 384(%rdi), %ymm2
+; AVX-NEXT:    vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 3360(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 3408(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 544(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 448(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 496(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 992(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 896(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 944(%rdi), %xmm3
+; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1440(%rdi), %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 1344(%rdi), %xmm5
+; AVX-NEXT:    vmovaps 1392(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 1888(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 1952(%rdi), %ymm0, %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovaps 1792(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 1840(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 2336(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 2400(%rdi), %ymm0, %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovaps 2240(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 2288(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 2784(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 2848(%rdi), %ymm0, %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovaps 2688(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 2736(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 3232(%rdi), %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, 3296(%rdi), %ymm0, %ymm3
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovaps 3136(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps 3184(%rdi), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 288(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm12[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 384(%rdi), %ymm0
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 736(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm13[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 832(%rdi), %ymm0
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm0[1],ymm6[3],ymm0[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1280(%rdi), %ymm4
+; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm15[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 1728(%rdi), %ymm15
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm8[0],ymm15[1],ymm8[3],ymm15[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2080(%rdi), %xmm3
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 2176(%rdi), %ymm14
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2528(%rdi), %xmm3
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 2624(%rdi), %ymm13
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm10[0],ymm13[1],ymm10[3],ymm13[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2976(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 3072(%rdi), %ymm12
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[3],ymm12[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2]
+; AVX-NEXT:    vmovdqa 3424(%rdi), %xmm2
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovapd 3520(%rdi), %ymm11
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 736(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 832(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1280(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd 608(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 2080(%rdi), %xmm1
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 2176(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT:    vmovdqa 512(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 2528(%rdi), %xmm1
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 2624(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd 1056(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 2976(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 3072(%rdi), %ymm2
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 3424(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vmovapd 3520(%rdi), %ymm15
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT:    vmovdqa 960(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 1504(%rdi), %ymm1
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 608(%rdi), %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT:    vmovdqa 512(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1056(%rdi), %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT:    vmovdqa 960(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1504(%rdi), %ymm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1952(%rdi), %ymm9
+; AVX-NEXT:    vmovapd 1952(%rdi), %ymm10
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2]
-; AVX-NEXT:    vmovdqa 1856(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[3],ymm10[2]
+; AVX-NEXT:    vmovdqa 1856(%rdi), %xmm9
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2400(%rdi), %ymm6
@@ -13048,75 +13075,76 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 352(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX-NEXT:    vmovaps 240(%rdi), %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 240(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 800(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
-; AVX-NEXT:    vmovapd 688(%rdi), %xmm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 688(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1248(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3]
-; AVX-NEXT:    vmovapd 1136(%rdi), %xmm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 1136(%rdi), %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1696(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
 ; AVX-NEXT:    vmovapd 1584(%rdi), %xmm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 2144(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
 ; AVX-NEXT:    vmovapd 2032(%rdi), %xmm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 2592(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3]
 ; AVX-NEXT:    vmovapd 2480(%rdi), %xmm7
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 3040(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX-NEXT:    vmovaps 2928(%rdi), %xmm7
-; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3]
+; AVX-NEXT:    vmovapd 2928(%rdi), %xmm7
+; AVX-NEXT:    vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 3488(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
-; AVX-NEXT:    vmovapd 3376(%rdi), %xmm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3]
+; AVX-NEXT:    vmovapd 3376(%rdi), %xmm7
+; AVX-NEXT:    vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 3264(%rdi), %xmm0
@@ -13124,7 +13152,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
 ; AVX-NEXT:    vmovdqa 3152(%rdi), %xmm2
-; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13133,347 +13161,346 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
 ; AVX-NEXT:    vmovdqa 2704(%rdi), %xmm1
-; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 2368(%rdi), %xmm14
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa 2368(%rdi), %xmm15
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3]
-; AVX-NEXT:    vmovdqa 2256(%rdi), %xmm10
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT:    vmovdqa 2256(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1920(%rdi), %xmm8
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3]
-; AVX-NEXT:    vmovapd 1808(%rdi), %xmm5
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm5[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 1472(%rdi), %xmm7
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[0,1,2],mem[3]
-; AVX-NEXT:    vmovapd 1360(%rdi), %xmm3
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm3[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1920(%rdi), %xmm14
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT:    vmovdqa 1808(%rdi), %xmm6
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm9[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1472(%rdi), %xmm5
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 1360(%rdi), %xmm10
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm10[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm1[0,1,2],mem[3]
-; AVX-NEXT:    vmovapd 912(%rdi), %xmm1
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm1[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 576(%rdi), %xmm9
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm2[0,1,2],mem[3]
-; AVX-NEXT:    vmovapd 464(%rdi), %xmm6
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = xmm6[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm11
-; AVX-NEXT:    vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm11 = ymm11[0,1,2],mem[3]
-; AVX-NEXT:    vmovapd 16(%rdi), %xmm12
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm12[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3]
-; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 80(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3]
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 304(%rdi), %xmm2
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
-; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 528(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3]
-; AVX-NEXT:    vmovdqa 640(%rdi), %xmm2
-; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 752(%rdi), %xmm9
-; AVX-NEXT:    vmovupd %ymm9, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3]
-; AVX-NEXT:    vmovdqa 864(%rdi), %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 912(%rdi), %xmm11
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm11[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 576(%rdi), %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 464(%rdi), %xmm3
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm3[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT:    vmovaps 16(%rdi), %xmm12
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm12[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm7
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 416(%rdi), %xmm9
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 640(%rdi), %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa 528(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 976(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3]
-; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 864(%rdi), %xmm2
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT:    vmovdqa 752(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm3
+; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1200(%rdi), %xmm6
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3]
-; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 976(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm4
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT:    vmovdqa 1200(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1648(%rdi), %xmm7
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
-; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm0
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1872(%rdi), %xmm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT:    vmovdqa 1424(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2096(%rdi), %xmm3
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
-; AVX-NEXT:    vmovdqa 2208(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2320(%rdi), %xmm0
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3]
-; AVX-NEXT:    vmovdqa 2432(%rdi), %xmm3
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2544(%rdi), %xmm14
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3]
-; AVX-NEXT:    vmovdqa 2656(%rdi), %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2768(%rdi), %xmm4
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3]
-; AVX-NEXT:    vmovdqa 2880(%rdi), %xmm8
+; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 2992(%rdi), %xmm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
-; AVX-NEXT:    vmovdqa 3104(%rdi), %xmm5
-; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 3216(%rdi), %xmm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
+; AVX-NEXT:    vmovdqa 1648(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm5
+; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm14[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT:    vmovdqa 1872(%rdi), %xmm14
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2208(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 2096(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2432(%rdi), %xmm11
+; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 2320(%rdi), %xmm15
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2656(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 2544(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 2880(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 2768(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 3104(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 2992(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 3328(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 3440(%rdi), %xmm5
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
-; AVX-NEXT:    vmovdqa 3552(%rdi), %xmm4
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 128(%rdi), %ymm6
+; AVX-NEXT:    vmovdqa 3216(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 3552(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT:    vmovdqa 3440(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm6
+; AVX-NEXT:    vmovapd 128(%rdi), %ymm10
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = mem[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 352(%rdi), %ymm13
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3]
-; AVX-NEXT:    vmovapd 256(%rdi), %xmm6
-; AVX-NEXT:    vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm6[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 352(%rdi), %ymm7
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 576(%rdi), %ymm6
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovapd 480(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 800(%rdi), %ymm11
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovapd 704(%rdi), %xmm5
-; AVX-NEXT:    vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm5[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vmovapd 1024(%rdi), %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 928(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovaps 1248(%rdi), %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 1152(%rdi), %xmm0
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm0[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovaps 1472(%rdi), %ymm9
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 1376(%rdi), %xmm15
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm15[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 1696(%rdi), %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 480(%rdi), %xmm1
+; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 1600(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovaps 1920(%rdi), %ymm7
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vmovaps 1824(%rdi), %xmm12
-; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm12[0,1],mem[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 800(%rdi), %ymm12
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 704(%rdi), %xmm13
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm13[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 1024(%rdi), %ymm2
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 928(%rdi), %xmm1
+; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps 1248(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 2144(%rdi), %ymm10
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 2048(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm0[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX-NEXT:    vmovapd 2368(%rdi), %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 2272(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm0[0],mem[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 2592(%rdi), %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT:    vmovapd 1472(%rdi), %ymm3
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 1376(%rdi), %xmm1
+; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovapd 2496(%rdi), %xmm0
-; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
-; AVX-NEXT:    vmovapd 2816(%rdi), %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 2720(%rdi), %xmm1
+; AVX-NEXT:    vmovapd 1696(%rdi), %ymm8
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovapd 1600(%rdi), %xmm1
 ; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = xmm1[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 1920(%rdi), %ymm4
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovdqa 1824(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT:    vmovaps 3040(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 2144(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps 2944(%rdi), %xmm1
+; AVX-NEXT:    vmovaps 2048(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX-NEXT:    vmovapd 2368(%rdi), %ymm5
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3]
+; AVX-NEXT:    vmovdqa 2272(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT:    vmovaps 2592(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 2496(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT:    vmovapd 3264(%rdi), %ymm14
+; AVX-NEXT:    vmovapd 2816(%rdi), %ymm14
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
-; AVX-NEXT:    vmovapd 3168(%rdi), %xmm1
+; AVX-NEXT:    vmovapd 2720(%rdi), %xmm1
 ; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = xmm1[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT:    vmovaps 3040(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 2944(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT:    vmovaps 3264(%rdi), %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT:    vmovaps 3168(%rdi), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovaps 3488(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -13483,21 +13510,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; AVX-NEXT:    vmovapd 192(%rdi), %ymm0
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm0
@@ -13510,11 +13535,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 992(%rdi), %xmm0
@@ -13522,7 +13546,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1312(%rdi), %ymm1
@@ -13536,28 +13560,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1440(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1536(%rdi), %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[3],ymm15[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; AVX-NEXT:    vmovapd 1760(%rdi), %ymm0
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1888(%rdi), %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1984(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2208(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 2112(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -13567,7 +13593,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2432(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm9[1],ymm5[3],ymm9[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2656(%rdi), %ymm8
@@ -13582,7 +13608,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2880(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm14[0],ymm5[1],ymm14[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3104(%rdi), %ymm6
@@ -13597,7 +13623,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 3328(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3552(%rdi), %ymm2
@@ -13618,7 +13645,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT:    vmovaps (%rsp), %xmm14 # 16-byte Reload
 ; AVX-NEXT:    vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = mem[0,1],xmm14[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
@@ -13854,7 +13881,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm1, 192(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 160(%r9)
-; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 128(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, 96(%r9)
@@ -13920,7 +13947,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, (%rax)
-; AVX-NEXT:    addq $4232, %rsp # imm = 0x1088
+; AVX-NEXT:    addq $4248, %rsp # imm = 0x1098
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -18242,950 +18269,938 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-FCP-LABEL: load_i64_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $7624, %rsp # imm = 0x1DC8
-; AVX512-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm16
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
+; AVX512-FCP-NEXT:    subq $6664, %rsp # imm = 0x1A08
+; AVX512-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm26
 ; AVX512-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 2256(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 1808(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 2688(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 960(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 896(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 2240(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 1792(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 3136(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpermi2q %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [4,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm13, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2q %zmm16, %zmm8, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 3152(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm13, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm30
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm16
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm24
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm13, %zmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm13, %zmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm9
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm8
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm13, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm11, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm4, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm14, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm9, %zmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm26
-; AVX512-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm28
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vpermi2q %zmm25, %zmm14, %zmm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm1, %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm25, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm25, %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm11, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm9
-; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2q %zmm9, %zmm0, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm11
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm11 = [5,12]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm16, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm11, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm25
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm25, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [5,12]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm6, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm25
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm2, %zmm12
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm6, %zmm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm2, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm11
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm23, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm21 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2q %zmm25, %zmm14, %zmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm22
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm25
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm25
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm23, %zmm12
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm2, %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermi2q %zmm15, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512-FCP-NEXT:    movb $24, %al
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm24, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm19, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm1, %zmm28
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm23, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm17
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,8,15,4,5,8,15]
+; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm17
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm22
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm9
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm23, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm1, %zmm30
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm24, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm23, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm28
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vpermi2q %zmm25, %zmm6, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm7, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm27
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm24, %zmm27
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm14, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm12, %zmm30
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm19, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2q %zmm25, %zmm31, %zmm24
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm25
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm31, %zmm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm31
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm10
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm3[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm30
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm28
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm23
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm27
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm22
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm24
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm29
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm16
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm22
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm31
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    movb $24, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm20
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm22
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm29
-; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm19, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm7, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
+; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm26
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm15, %zmm26
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm12, %zmm23
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm19, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    movb $-32, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm19, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm26, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm7, %zmm23
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm19, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm15, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm20, %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm25, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm25, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm18
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm10
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm16
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm28
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm19
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm24
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm20
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm18
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm26
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm21
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm25
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm12, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm12, %zmm13
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm17, %zmm12, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm11[4,5,4,5],zmm22[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,0,4,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm1, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm9[4,5,4,5],zmm27[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm12, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm27, %zmm1, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm5[4,5,4,5],zmm8[4,5,4,5]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm22
+; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm5, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm27
+; AVX512-FCP-NEXT:    vpermi2q %zmm5, %zmm8, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm14
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    movb $-32, %al
+; AVX512-FCP-NEXT:    kmovw %eax, %k2
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm26, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm29, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm17, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 960(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm9, %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm9, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm9, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm13
-; AVX512-FCP-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm15, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 384(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 320(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 256(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 192(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 128(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, 64(%rsi)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm13, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 320(%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 128(%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 192(%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 384(%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 448(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 256(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, 320(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 448(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 320(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 128(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 384(%r8)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm24 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm24 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm9 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm12 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm27, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm15
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm16
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm17
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm19
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm20
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm3, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm20
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm20
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm18, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm20, 448(%rsi)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm20, 384(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 320(%rsi)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm20, 256(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 320(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rdx)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rdx)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 64(%rdx)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 384(%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 320(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 448(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 256(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 320(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 128(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 192(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 64(%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 384(%r8)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm3, 448(%r9)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -19203,35 +19218,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm3, 384(%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 448(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 256(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 320(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 128(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 192(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm2, 64(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm3, 384(%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovaps %zmm11, 384(%rax)
-; AVX512-FCP-NEXT:    vmovaps %zmm4, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 192(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512-FCP-NEXT:    addq $7624, %rsp # imm = 0x1DC8
+; AVX512-FCP-NEXT:    addq $6664, %rsp # imm = 0x1A08
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -20232,950 +20252,938 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $7624, %rsp # imm = 0x1DC8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    subq $6664, %rsp # imm = 0x1A08
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 2688(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm1, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 960(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 896(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 2240(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 1792(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 3136(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 2256(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 1808(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [4,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm16, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 3152(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm13, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm24
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm13, %zmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm13, %zmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm11, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm4, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm14, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm9, %zmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm11, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm25, %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm1, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm25, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm19
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm25, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm0, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm11 = [5,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm10, %zmm16, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm5, %zmm25
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm25, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm6 = [5,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm6, %zmm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm2, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm30, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm23, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm21 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT:    movb $24, %al
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm25, %zmm14, %zmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm19, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm23, %zmm12
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm20, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm17
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm17
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm19, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm26, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm7, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm14, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm12, %zmm30
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm20, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm23, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm3[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm1, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm24, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm23, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm23, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm25, %zmm6, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm20, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm24, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm7, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm12, %zmm23
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm19, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm20, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm25, %zmm31, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm8, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm31, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm31
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm23
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm24
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm25, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm19, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm7, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm19, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm15, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm20, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movb $24, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm20
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm19
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm26
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm20
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm26
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm15, %zmm26
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movb $-32, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm7, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm12, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm12, %zmm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm17, %zmm12, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm11[4,5,4,5],zmm22[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm9[4,5,4,5],zmm27[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm12, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm27, %zmm1, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm5[4,5,4,5],zmm8[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm5, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm27
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm5, %zmm8, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    movb $-32, %al
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm26, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 512(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm29, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm17, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 960(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm9, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm15, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 384(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 320(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 256(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 192(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 128(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, 64(%rsi)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm13, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 320(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 128(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 192(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 384(%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 448(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 256(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 320(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 448(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 320(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 384(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm24 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm4 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm9 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm12 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm27, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm19
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm18, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm20, 448(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm20, 384(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 320(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm20, 256(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 320(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 64(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 384(%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 320(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 448(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 256(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 320(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 128(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 192(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 64(%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 384(%r8)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 448(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -21193,35 +21201,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 384(%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm3, 384(%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm11, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm4, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $7624, %rsp # imm = 0x1DC8
+; AVX512DQ-FCP-NEXT:    addq $6664, %rsp # imm = 0x1A08
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -22206,932 +22219,949 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    subq $7624, %rsp # imm = 0x1DC8
-; AVX512BW-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    subq $6792, %rsp # imm = 0x1A88
+; AVX512BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm27
+; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm25
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm24, %zmm3
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm28
+; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm24, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 2256(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm12, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm26
+; AVX512BW-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm13, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm13
+; AVX512BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm11, %zmm6, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm24, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm24, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm24, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm24, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm24, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa 1808(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm24, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm24, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm4, %zmm6, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 3152(%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm4, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm24
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm24, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm10, %zmm16
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm12, %zmm16
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm16, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm13
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm7 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm27
-; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm13
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm12, %zmm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm13, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm9, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm10, %zmm7
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm26
-; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm9, %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm9, %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm9, %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm23
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm11
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm17, %zmm11, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm10, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm10, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm10, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm24
-; AVX512BW-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm30
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm5, %zmm15, %zmm10
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm0
+; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm11, %zmm1, %zmm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm0
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm9
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm0, %zmm9
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm9, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm10, %zmm3
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm17, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm3, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm4, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm8, %zmm24, %zmm4
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm4
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm23, %zmm25
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm27, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm31, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm29, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm23, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm25, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm31, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm29, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm31, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm1, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm29, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm1, %zmm26
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm31, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm29, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm25, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm1, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm11
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm29, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm12
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm3, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm19
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm19
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm29, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm15, %zmm5, %zmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm27, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm26
+; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm5
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm25
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm28
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm27, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm13
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm31, %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm23, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm27, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm31, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm12, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm29
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm8, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm27, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm29
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm12, %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm27
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm31, %zmm0
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm8
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm12, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm18
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm12
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm26
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm22
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm23
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm31
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm26
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm28
 ; AVX512BW-FCP-NEXT:    movb $24, %al
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm1[4,5,4,5],zmm31[4,5,4,5]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm17, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm1[4,5,4,5],zmm30[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm17, %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm1[4,5,4,5],zmm24[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[4,5,4,5],zmm23[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm17, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm30, %zmm28
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm15, %zmm6, %zmm30
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[4,5,4,5],zmm10[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm21
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm15, %zmm14, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm23
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm14, %zmm15, %zmm18
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[4,5,4,5],zmm2[4,5,4,5]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512BW-FCP-NEXT:    vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-FCP-NEXT:    # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm29
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $-32, %al
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm28, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm0
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT:    vmovdqa 2688(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [2,5]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm0, %ymm14, %ymm9
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm21, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm1, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm11, %ymm14, %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm11
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm11
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm30, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 896(%rdi), %ymm12
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm16 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm9, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %ymm21
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm27 = ymm21[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm24, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %ymm22
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm28 = ymm22[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],ymm22[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermt2q %ymm21, %ymm14, %ymm22
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm22, %zmm13, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %ymm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %ymm24
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm22, %ymm24, %ymm14
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm17, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm23, %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm4
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm5, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm6, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm7, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm7, %zmm8, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm30, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 448(%rsi)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm12, 384(%rsi)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm12, 320(%rsi)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm12, 256(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rsi)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm12, 128(%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rsi)
-; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT:    vmovaps %zmm12, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 448(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rdx)
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm6, %xmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm27, %xmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm3, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm28, %xmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm23, %zmm3, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} ymm22 = ymm24[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm22, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 448(%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 384(%rsi)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm3, 320(%rsi)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm3, 256(%rsi)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm3, 192(%rsi)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm3, 128(%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 256(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm2, 192(%rdx)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm2, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 384(%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, 448(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 256(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 320(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 128(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 448(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 256(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 320(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 384(%r8)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm2, 384(%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 448(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 256(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 128(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 192(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 64(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 448(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 256(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 128(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 192(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 64(%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%r8)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 448(%r9)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -23168,18 +23198,21 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 448(%rax)
+; AVX512BW-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 320(%rax)
-; AVX512BW-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT:    addq $7624, %rsp # imm = 0x1DC8
+; AVX512BW-FCP-NEXT:    addq $6792, %rsp # imm = 0x1A88
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -24164,932 +24197,949 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    subq $7624, %rsp # imm = 0x1DC8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    subq $6792, %rsp # imm = 0x1A88
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2256(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1808(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm24, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm24, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm4, %zmm6, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3152(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm12, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2816(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm24, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm13, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm11, %zmm6, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm24, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm24, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm24, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm24, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm4, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm12, %zmm24, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm9 = [5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm10, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm12, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm13, %zmm16, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm7 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm12, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm13, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm9, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm9, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm9, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2368(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2304(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm9, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2560(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2496(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2624(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1920(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm9, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2112(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2048(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2176(%rdi), %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm17, %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm10, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm10, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm10, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm10, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm10, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm10, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3456(%rdi), %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3392(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm2, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3520(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm5, %zmm15, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm11, %zmm1, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2880(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 640(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1536(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2432(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm9, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1984(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm10, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3328(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm17, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm3, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm4, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm4, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm8, %zmm24, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm10, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm23, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm27, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm31, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm29, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm23, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm25, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm29, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm1, %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm1, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm29, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm31, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm12, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm27, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm31, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm23, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm27, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm31, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm12, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm8, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm27, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm12, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm12, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm31, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm12, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm24, %zmm29, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm1, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm29, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm25, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm18, %zmm1, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm29, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm3, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm1, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm29, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm15, %zmm5, %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm27, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3328(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm21, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm1, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm1, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    movb $24, %al
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm1[4,5,4,5],zmm31[4,5,4,5]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm17, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm1[4,5,4,5],zmm30[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm17, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm1[4,5,4,5],zmm24[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[4,5,4,5],zmm23[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm17, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm30, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm15, %zmm6, %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[4,5,4,5],zmm10[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm15, %zmm14, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm14, %zmm15, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[4,5,4,5],zmm2[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    movb $-32, %al
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm28, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2688(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm14 = [2,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm0, %ymm14, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm21, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 512(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm10, %zmm1, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm11, %ymm14, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm9, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm11, %zmm30, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 896(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm16 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm12, %zmm9, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2240(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm27 = ymm21[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm9, %ymm14, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm21, %zmm24, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1856(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm28 = ymm22[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],ymm22[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %ymm21, %ymm14, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm22, %zmm13, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3200(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3136(%rdi), %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm22, %ymm24, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm14, %zmm17, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm13, %zmm23, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm0, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm3, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm4, %zmm5, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 960(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm5, %zmm6, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2304(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm6, %zmm7, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 1856(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm7, %zmm8, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm8, %zmm30, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 448(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm12, 384(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm12, 320(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm12, 256(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm12, 128(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm12, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 448(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 192(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm6, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm27, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm20, %zmm3, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm28, %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm23, %zmm3, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} ymm22 = ymm24[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm22, %zmm18, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 448(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 384(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm3, 320(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm3, 256(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm3, 192(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm3, 128(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 64(%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 256(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm2, 192(%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm2, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 384(%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, 448(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 256(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 320(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 128(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 448(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 256(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 320(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 128(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 192(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 384(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm2, 384(%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 448(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 256(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 128(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 192(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 64(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 448(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 256(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 128(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 192(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 448(%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -25126,18 +25176,21 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 448(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    addq $7624, %rsp # imm = 0x1DC8
+; AVX512DQ-BW-FCP-NEXT:    addq $6792, %rsp # imm = 0x1A88
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <448 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index 51b6222077f82..25c1ece991afc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -196,17 +196,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovaps 96(%rdi), %xmm2
 ; AVX512-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512-NEXT:    vmovaps 112(%rdi), %xmm3
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512-NEXT:    vpbroadcastq 120(%rdi), %xmm4
+; AVX512-NEXT:    valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
 ; AVX512-NEXT:    vmovaps %xmm5, (%rsi)
 ; AVX512-NEXT:    vmovaps %xmm0, (%rdx)
 ; AVX512-NEXT:    vmovaps %xmm6, (%rcx)
 ; AVX512-NEXT:    vmovaps %xmm1, (%r8)
 ; AVX512-NEXT:    vmovaps %xmm7, (%r9)
 ; AVX512-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512-NEXT:    vmovaps %xmm3, (%r10)
+; AVX512-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i64_stride8_vf2:
@@ -214,30 +215,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovaps (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovaps 48(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT:    vmovaps 80(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512-FCP-NEXT:    vmovaps 96(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovaps (%rdi), %xmm3
+; AVX512-FCP-NEXT:    vmovaps 16(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovaps 32(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
 ; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512-FCP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT:    vmovaps 112(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vmovaps 80(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
 ; AVX512-FCP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT:    vmovaps %xmm5, (%rsi)
-; AVX512-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512-FCP-NEXT:    vmovaps %xmm6, (%rcx)
-; AVX512-FCP-NEXT:    vmovaps %xmm1, (%r8)
-; AVX512-FCP-NEXT:    vmovaps %xmm7, (%r9)
-; AVX512-FCP-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512-FCP-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512-FCP-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512-FCP-NEXT:    vmovaps 96(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512-FCP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vmovaps %xmm7, (%rsi)
+; AVX512-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT:    vmovaps %xmm8, (%rcx)
+; AVX512-FCP-NEXT:    vmovaps %xmm3, (%r8)
+; AVX512-FCP-NEXT:    vmovaps %xmm9, (%r9)
+; AVX512-FCP-NEXT:    vmovaps %xmm4, (%r11)
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%r10)
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rax)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i64_stride8_vf2:
@@ -258,17 +263,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vmovaps 96(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-NEXT:    vmovaps 112(%rdi), %xmm3
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512DQ-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512DQ-NEXT:    vpbroadcastq 120(%rdi), %xmm4
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
 ; AVX512DQ-NEXT:    vmovaps %xmm5, (%rsi)
 ; AVX512DQ-NEXT:    vmovaps %xmm0, (%rdx)
 ; AVX512DQ-NEXT:    vmovaps %xmm6, (%rcx)
 ; AVX512DQ-NEXT:    vmovaps %xmm1, (%r8)
 ; AVX512DQ-NEXT:    vmovaps %xmm7, (%r9)
 ; AVX512DQ-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512DQ-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512DQ-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512DQ-NEXT:    vmovaps %xmm3, (%r10)
+; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf2:
@@ -276,30 +282,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovaps 48(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT:    vmovaps 80(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-FCP-NEXT:    vmovaps 96(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovaps 16(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovaps 32(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512DQ-FCP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT:    vmovaps 112(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovaps 80(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
 ; AVX512DQ-FCP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm5, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm6, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm7, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovaps 96(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512DQ-FCP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm7, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm8, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm3, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm9, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovaps %xmm4, (%r11)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%r10)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf2:
@@ -320,17 +330,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovaps 96(%rdi), %xmm2
 ; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512BW-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-NEXT:    vmovaps 112(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512BW-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512BW-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512BW-NEXT:    vpbroadcastq 120(%rdi), %xmm4
+; AVX512BW-NEXT:    valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
 ; AVX512BW-NEXT:    vmovaps %xmm5, (%rsi)
 ; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
 ; AVX512BW-NEXT:    vmovaps %xmm6, (%rcx)
 ; AVX512BW-NEXT:    vmovaps %xmm1, (%r8)
 ; AVX512BW-NEXT:    vmovaps %xmm7, (%r9)
 ; AVX512BW-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512BW-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512BW-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512BW-NEXT:    vmovaps %xmm3, (%r10)
+; AVX512BW-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i64_stride8_vf2:
@@ -338,30 +349,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovaps 48(%rdi), %xmm4
-; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-FCP-NEXT:    vmovaps 80(%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-FCP-NEXT:    vmovaps 96(%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %xmm3
+; AVX512BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm4
+; AVX512BW-FCP-NEXT:    vmovaps 32(%rdi), %xmm5
+; AVX512BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
 ; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-FCP-NEXT:    vmovaps 112(%rdi), %xmm3
+; AVX512BW-FCP-NEXT:    vmovaps 80(%rdi), %xmm3
 ; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
 ; AVX512BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512BW-FCP-NEXT:    vmovaps %xmm5, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm6, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm1, (%r8)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm7, (%r9)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512BW-FCP-NEXT:    vmovaps 96(%rdi), %xmm4
+; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT:    vmovaps %xmm7, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm8, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm3, (%r8)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm9, (%r9)
+; AVX512BW-FCP-NEXT:    vmovaps %xmm4, (%r11)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rax)
+; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride8_vf2:
@@ -382,17 +397,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vmovaps 96(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-BW-NEXT:    vmovaps 112(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512DQ-BW-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512DQ-BW-NEXT:    vpbroadcastq 120(%rdi), %xmm4
+; AVX512DQ-BW-NEXT:    valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm5, (%rsi)
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rdx)
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm6, (%rcx)
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm1, (%r8)
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm7, (%r9)
 ; AVX512DQ-BW-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512DQ-BW-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512DQ-BW-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512DQ-BW-NEXT:    vmovaps %xmm3, (%r10)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf2:
@@ -400,30 +416,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 48(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 80(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 96(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovaps 16(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovaps 32(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
 ; AVX512DQ-BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps 112(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovaps 80(%rdi), %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
 ; AVX512DQ-BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm5, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm7, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%r11)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm8, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps 96(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512DQ-BW-FCP-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm7, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm8, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm9, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm4, (%r11)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <16 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 0, i32 8>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index d1d7cb0a34332..592f67c72f1d4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -2110,6 +2110,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX-LABEL: load_i8_stride3_vf64:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2119,6 +2120,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
 ; AVX-NEXT:    vmovdqa 112(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 144(%rdi), %xmm10
 ; AVX-NEXT:    vmovdqa 160(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2171,55 +2173,56 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpor %xmm0, %xmm8, %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm1
-; AVX-NEXT:    vpor %xmm1, %xmm5, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm5, %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm15
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
+; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
-; AVX-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
-; AVX-NEXT:    vpor %xmm10, %xmm15, %xmm10
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
-; AVX-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX-NEXT:    vpor %xmm12, %xmm15, %xmm12
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
-; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm6, %xmm0
+; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm15 = [18446744073709551615,16777215]
+; AVX-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX-NEXT:    vpor %xmm0, %xmm10, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
+; AVX-NEXT:    vpshufb %xmm1, %xmm12, %xmm10
+; AVX-NEXT:    vpor %xmm6, %xmm10, %xmm6
+; AVX-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm10, %xmm1
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX-NEXT:    vpshufb %xmm4, %xmm7, %xmm6
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm7, %xmm15, %xmm15
-; AVX-NEXT:    vpor %xmm6, %xmm15, %xmm6
+; AVX-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT:    vpor %xmm7, %xmm12, %xmm7
 ; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm11
-; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm15
-; AVX-NEXT:    vpor %xmm15, %xmm11, %xmm11
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT:    vpor %xmm12, %xmm11, %xmm11
 ; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rsi)
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT:    vpor %xmm2, %xmm12, %xmm2
+; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, (%rsi)
 ; AVX-NEXT:    vmovdqa %xmm2, 48(%rsi)
 ; AVX-NEXT:    vmovdqa %xmm11, 32(%rsi)
-; AVX-NEXT:    vmovdqa %xmm6, 16(%rsi)
-; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
-; AVX-NEXT:    vmovdqa %xmm12, 48(%rdx)
-; AVX-NEXT:    vmovdqa %xmm10, 32(%rdx)
+; AVX-NEXT:    vmovdqa %xmm7, 16(%rsi)
+; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:    vmovdqa %xmm6, 48(%rdx)
+; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
 ; AVX-NEXT:    vmovdqa %xmm14, 16(%rdx)
 ; AVX-NEXT:    vmovdqa %xmm5, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm8, 48(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm9, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm13, 16(%rcx)
+; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: load_i8_stride3_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index ac14f55e3f0ed..11e2509795ad3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -6617,8 +6617,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm7
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
 ; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm11
@@ -6629,31 +6629,33 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
-; AVX512-FCP-NEXT:    vmovdqa 144(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ~ymm16) | ymm11
+; AVX512-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
 ; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %xmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm6, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512-FCP-NEXT:    vpandnq %zmm6, %zmm21, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm21) | zmm6
 ; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm15
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 & (ymm12 ^ ymm14))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,4,9,14],zero,zero,zero,xmm15[2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm0, %xmm15, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm6, %zmm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm10, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm6
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
@@ -6684,15 +6686,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm19) | ymm3
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
 ; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
@@ -6720,15 +6722,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
 ; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm19
 ; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm12 ^ ymm14))
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
@@ -6757,16 +6759,16 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
 ; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm4 & (ymm12 ^ ymm14))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
 ; AVX512-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -6794,9 +6796,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
@@ -7027,8 +7029,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm11
@@ -7039,31 +7041,33 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ~ymm16) | ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm6, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vpandnq %zmm6, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm21) | zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm15
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 & (ymm12 ^ ymm14))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,4,9,14],zero,zero,zero,xmm15[2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm15, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm6, %zmm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm10, %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
@@ -7094,15 +7098,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm19) | ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
@@ -7130,15 +7134,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm12 ^ ymm14))
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
@@ -7167,16 +7171,16 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm4 & (ymm12 ^ ymm14))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -7204,9 +7208,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
@@ -7472,52 +7476,52 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    movl $127, %eax
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
-; AVX512BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
+; AVX512BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11]
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
 ; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [8,9,10,11,12,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm12, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm9, %zmm20
 ; AVX512BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm12 {%k3}
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
 ; AVX512BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k6}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
 ; AVX512BW-FCP-NEXT:    movl $8456, %eax # imm = 0x2108
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vmovdqa 176(%rdi), %xmm13
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
@@ -7551,12 +7555,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm4
 ; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
@@ -7590,26 +7594,26 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm9
 ; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
 ; AVX512BW-FCP-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
 ; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm10 {%k3}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k4}
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
@@ -7630,7 +7634,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
@@ -7916,52 +7920,52 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    movl $127, %eax
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm12, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm9, %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm12 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k6}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    movl $8456, %eax # imm = 0x2108
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 176(%rdi), %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
@@ -7995,12 +7999,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
@@ -8034,26 +8038,26 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
@@ -8074,7 +8078,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 5ab09194c5b83..489433aeaf16e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -3297,99 +3297,104 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
 ; AVX512BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
-; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512BW-FCP-NEXT:    movw $-28382, %di # imm = 0x9122
+; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,1,10,3,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; AVX512BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [0,17,2,3,4,21,6,7]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
 ; AVX512BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm5 {%k1}
 ; AVX512BW-FCP-NEXT:    movw $8772, %di # imm = 0x2244
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k3
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [0,1,18,3,4,21,6,7]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm7, %zmm7
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
 ; AVX512BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k4
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm7 {%k4}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [0,9,2,11]
+; AVX512BW-FCP-NEXT:    vpermd %zmm1, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm7 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm9 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [16,1,2,19,4,5,6,23]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[0,7,14]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k4}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [16,1,2,3,20,5,6,23]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm10, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm9, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -3497,99 +3502,104 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %di # imm = 0x9122
+; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,1,10,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm6 = [0,17,2,3,4,21,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
 ; AVX512DQ-BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm5 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movw $8772, %di # imm = 0x2244
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k3
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm7 = [0,1,18,3,4,21,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm7, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k4
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm7 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [0,9,2,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm1, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [16,1,2,19,4,5,6,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [16,1,2,3,20,5,6,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm10, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm9, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <112 x i8>, ptr %in.vec, align 64
@@ -5629,202 +5639,207 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm3 ^ ymm1))
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-NEXT:    vpor %xmm4, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm4
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm9
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm10, %xmm9, %xmm13
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm15
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm19
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm6
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm2 ^ ymm4))
+; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm9, %xmm8, %xmm12
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm10 ^ ymm7))
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7,8,9],ymm8[10],ymm13[11,12],ymm8[13],ymm13[14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm12 & mem)
 ; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm16 & (ymm13 ^ ymm11))
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm18
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm3 ^ ymm1))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
-; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm15, %xmm12, %xmm15
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm15
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm2 ^ ymm4))
+; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm13, %xmm11, %xmm13
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm7 ^ ymm10))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7,8,9],ymm8[10],ymm11[11,12,13],ymm8[14],ymm11[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ~mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm16 & (ymm11 ^ ymm14))
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7,8,9,10],ymm13[11],ymm8[12,13],ymm13[14],ymm8[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm4 ^ (ymm13 & (ymm2 ^ ymm4))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
-; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm17) | ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm1 ^ ymm3))
+; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm8
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
 ; AVX512-NEXT:    vpor %xmm8, %xmm14, %xmm8
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm12
-; AVX512-NEXT:    vpshufb %xmm12, %xmm5, %xmm14
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm12
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm14
+; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6,7],ymm13[8],ymm8[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm19
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm13
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-NEXT:    vpor %xmm12, %xmm14, %xmm12
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-NEXT:    vpor %xmm13, %xmm14, %xmm13
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm14, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm14
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4,5,6],ymm14[7,8],ymm8[9,10],ymm14[11],ymm8[12,13,14],ymm14[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm4 ^ (ymm14 & (ymm2 ^ ymm4))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
 ; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-NEXT:    vpor %xmm12, %xmm14, %xmm12
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm14
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm14, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm14
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-NEXT:    vpor %xmm13, %xmm14, %xmm13
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3],ymm14[4],ymm8[5,6],ymm14[7,8],ymm8[9,10,11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm4 ^ ymm2))
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
-; AVX512-NEXT:    vpor %xmm12, %xmm15, %xmm12
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm15
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa %ymm13, %ymm15
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
-; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14]
+; AVX512-NEXT:    vpor %xmm13, %xmm15, %xmm13
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm10 ^ ymm7))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13,14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm15
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm4 ^ ymm2))
+; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm12 & (ymm3 ^ ymm1))
+; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,8,15]
+; AVX512-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm4
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm1))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm10 ^ ymm7))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm4 ^ ymm2))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm17) | ymm0
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm18, (%rsi)
-; AVX512-NEXT:    vmovdqa64 %ymm19, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %ymm20, (%rcx)
-; AVX512-NEXT:    vmovdqa64 %ymm21, (%r8)
-; AVX512-NEXT:    vmovdqa %ymm14, (%r9)
+; AVX512-NEXT:    vmovdqa %ymm11, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %ymm19, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %ymm20, (%r8)
+; AVX512-NEXT:    vmovdqa64 %ymm21, (%r9)
 ; AVX512-NEXT:    vmovdqa %ymm8, (%r10)
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512-NEXT:    vzeroupper
@@ -5835,198 +5850,202 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm11
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm18
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,7,14],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & mem)
 ; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm16 & (ymm6 ^ ymm9))
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm3 ^ (ymm9 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm14
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm14
-; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm14
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm14, %xmm9, %xmm14
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7,8,9],ymm13[10],ymm9[11,12,13],ymm13[14],ymm9[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm14 & ~mem)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm15 ^ (ymm16 & (ymm9 ^ ymm15))
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm13
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm13
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm4 ^ (ymm12 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm19
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm13, %ymm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm11
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm12, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[5,12]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm12))
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm12 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4,5,6],ymm15[7,8],ymm12[9,10],ymm15[11],ymm12[12,13,14],ymm15[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm15
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm12
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm12, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[6,13]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm12, %xmm15, %xmm12
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm17 & (ymm12 ^ ymm1))
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3],ymm15[4],ymm1[5,6],ymm15[7,8],ymm1[9,10,11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm15
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[0,7,14]
+; AVX512-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm1))
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm15
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
-; AVX512-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm15
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
-; AVX512-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm12, %xmm15, %xmm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm16) | ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[1,8,15]
+; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm17 & (ymm3 ^ ymm1))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%rdx)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm18 ^ (ymm7 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rsi)
+; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%rdx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, (%r8)
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r10)
+; AVX512-FCP-NEXT:    vmovdqa %ymm15, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -6036,202 +6055,207 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm4, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm9
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm10, %xmm9, %xmm13
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm15
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm19
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm6
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm7
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm9, %xmm8, %xmm12
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm10
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7,8,9],ymm8[10],ymm13[11,12],ymm8[13],ymm13[14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm12 & mem)
 ; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm12
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm16 & (ymm13 ^ ymm11))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm18
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm12
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm15, %xmm12, %xmm15
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm12
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm15
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm11, %xmm13
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7,8,9],ymm8[10],ymm11[11,12,13],ymm8[14],ymm11[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ~mem)
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm16 & (ymm11 ^ ymm14))
+; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7,8,9,10],ymm13[11],ymm8[12,13],ymm13[14],ymm8[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm4 ^ (ymm13 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm17) | ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm8
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %xmm8, %xmm14, %xmm8
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm5, %xmm14
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm20
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm12
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6,7],ymm13[8],ymm8[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm19
+; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm13
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm14, %xmm12
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm14, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4,5,6],ymm14[7,8],ymm8[9,10],ymm14[11],ymm8[12,13,14],ymm14[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm4 ^ (ymm14 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm20
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm14, %xmm12
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm14
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm14, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm14
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3],ymm14[4],ymm8[5,6],ymm14[7,8],ymm8[9,10,11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm15, %xmm12
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm15
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm15
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14]
+; AVX512DQ-NEXT:    vpor %xmm13, %xmm15, %xmm13
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13,14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm15
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm12 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,8,15]
+; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm17) | ymm0
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm18, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, (%r8)
-; AVX512DQ-NEXT:    vmovdqa %ymm14, (%r9)
+; AVX512DQ-NEXT:    vmovdqa %ymm11, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, (%r8)
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, (%r9)
 ; AVX512DQ-NEXT:    vmovdqa %ymm8, (%r10)
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -6242,198 +6266,202 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,7,14],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & mem)
 ; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm16 & (ymm6 ^ ymm9))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm3 ^ (ymm9 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm14
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm14
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm9, %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7,8,9],ymm13[10],ymm9[11,12,13],ymm13[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm14 & ~mem)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm15 ^ (ymm16 & (ymm9 ^ ymm15))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm13
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm4 ^ (ymm12 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm19
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm12, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[5,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm12))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm12 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4,5,6],ymm15[7,8],ymm12[9,10],ymm15[11],ymm12[12,13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm12
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[6,13]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm15, %xmm12
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm17 & (ymm12 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3],ymm15[4],ymm1[5,6],ymm15[7,8],ymm1[9,10,11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[0,7,14]
+; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
-; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm15
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm15, %xmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm16) | ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[1,8,15]
+; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm17 & (ymm3 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%rdx)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm18 ^ (ymm7 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r10)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm15, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -6442,186 +6470,194 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm5
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm11
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm12
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm10
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm6
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %ymm3
 ; AVX512BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
 ; AVX512BW-NEXT:    kmovd %r11d, %k5
-; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm7, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    movw $992, %r11w # imm = 0x3E0
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm0 {%k5}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT:    movw $9288, %r11w # imm = 0x2448
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512BW-NEXT:    vpblendmw %ymm4, %ymm3, %ymm5 {%k1}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    movw $992, %r11w # imm = 0x3E0
+; AVX512BW-NEXT:    kmovd %r11d, %k2
+; AVX512BW-NEXT:    vmovdqu16 %ymm5, %ymm0 {%k2}
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512BW-NEXT:    movw $8772, %r11w # imm = 0x2244
-; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm13
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
-; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm9
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-NEXT:    kmovd %r11d, %k2
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm7 {%k2}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm9
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm7
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm11
+; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm8
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512BW-NEXT:    movl $-524288, %edi # imm = 0xFFF80000
 ; AVX512BW-NEXT:    kmovd %edi, %k4
-; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm1 {%k4}
+; AVX512BW-NEXT:    vmovdqu8 %ymm9, %ymm0 {%k4}
 ; AVX512BW-NEXT:    movw $4644, %di # imm = 0x1224
-; AVX512BW-NEXT:    kmovd %edi, %k2
-; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    movl $511, %edi # imm = 0x1FF
-; AVX512BW-NEXT:    kmovd %edi, %k3
-; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k3}
-; AVX512BW-NEXT:    movw $9288, %di # imm = 0x2448
 ; AVX512BW-NEXT:    kmovd %edi, %k3
-; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k4}
-; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm9 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm11, %xmm9, %xmm11
+; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm9 {%k5}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    movl $511, %edi # imm = 0x1FF
+; AVX512BW-NEXT:    kmovd %edi, %k6
+; AVX512BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k6}
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm11 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm11 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm12 {%k2}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm13, %xmm12, %xmm12
 ; AVX512BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
 ; AVX512BW-NEXT:    kmovd %edi, %k4
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm12, %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm11 {%k5}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm13, %xmm11, %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm11 {%k2}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15]
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm12 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm11 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm13, %xmm11, %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-NEXT:    vpor %xmm13, %xmm14, %xmm13
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512BW-NEXT:    kmovd %edi, %k5
-; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm11 {%k5}
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm12 {%k1}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm13 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm14, %xmm12, %xmm12
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
-; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm12 {%k5}
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vpblendmw %ymm4, %ymm3, %ymm13 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15]
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm14 {%k2}
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
-; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
-; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %ymm10, (%rdx)
-; AVX512BW-NEXT:    vmovdqa %ymm12, (%rcx)
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512BW-NEXT:    vporq %xmm15, %xmm16, %xmm15
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k5}
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
+; AVX512BW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k4} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k5}
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa %ymm9, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm10, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa %ymm11, (%r8)
-; AVX512BW-NEXT:    vmovdqa %ymm5, (%r9)
-; AVX512BW-NEXT:    vmovdqa %ymm4, (%r10)
-; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa %ymm12, (%r9)
+; AVX512BW-NEXT:    vmovdqa %ymm13, (%r10)
+; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -6629,182 +6665,182 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm10
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k5
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $992, %r11w # imm = 0x3E0
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
 ; AVX512BW-FCP-NEXT:    movw $8772, %r11w # imm = 0x2244
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512BW-FCP-NEXT:    vpermd %ymm7, %ymm8, %ymm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512BW-FCP-NEXT:    movl $-524288, %r11d # imm = 0xFFF80000
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k4
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm1 {%k4}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm1 {%k4}
 ; AVX512BW-FCP-NEXT:    movw $4644, %r11w # imm = 0x1224
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $511, %r11d # imm = 0x1FF
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k3}
 ; AVX512BW-FCP-NEXT:    movw $9288, %r11w # imm = 0x2448
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k3
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k4}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k4}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT:    movl $261632, %r11d # imm = 0x3FE00
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k4
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm9 {%k5}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,5,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm8 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512BW-FCP-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm12, %zmm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm11 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm12, %ymm11 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm12 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm14, %xmm13
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm13 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k5}
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm11, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm12, (%r10)
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -6813,186 +6849,194 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm11
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm12
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %ymm3
 ; AVX512DQ-BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
 ; AVX512DQ-BW-NEXT:    kmovd %r11d, %k5
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    movw $992, %r11w # imm = 0x3E0
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm0 {%k5}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT:    movw $9288, %r11w # imm = 0x2448
 ; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm4, %ymm3, %ymm5 {%k1}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    movw $992, %r11w # imm = 0x3E0
+; AVX512DQ-BW-NEXT:    kmovd %r11d, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm5, %ymm0 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512DQ-BW-NEXT:    movw $8772, %r11w # imm = 0x2244
-; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
-; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-NEXT:    kmovd %r11d, %k2
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm7 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm7
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm11
+; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512DQ-BW-NEXT:    movl $-524288, %edi # imm = 0xFFF80000
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm1 {%k4}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm9, %ymm0 {%k4}
 ; AVX512DQ-BW-NEXT:    movw $4644, %di # imm = 0x1224
-; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    movl $511, %edi # imm = 0x1FF
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k3}
-; AVX512DQ-BW-NEXT:    movw $9288, %di # imm = 0x2448
-; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k4}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm9 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm9, %xmm11
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm9 {%k5}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    movl $511, %edi # imm = 0x1FF
+; AVX512DQ-BW-NEXT:    kmovd %edi, %k6
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k6}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm11 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm12 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm12, %xmm12
 ; AVX512DQ-BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm11 {%k5}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm11 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm12 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm14, %xmm13
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512DQ-BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm11 {%k5}
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm4, %ymm12 {%k1}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm12, %xmm12
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm12 {%k5}
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm4, %ymm3, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm14 {%k2}
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
-; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
-; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm10, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm12, (%rcx)
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k5}
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k4} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k5}
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm9, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm10, (%rcx)
 ; AVX512DQ-BW-NEXT:    vmovdqa %ymm11, (%r8)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%r9)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%r10)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm12, (%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm13, (%r10)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -7000,182 +7044,182 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k5
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $992, %r11w # imm = 0x3E0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    movw $8772, %r11w # imm = 0x2244
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm7, %ymm8, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512DQ-BW-FCP-NEXT:    movl $-524288, %r11d # imm = 0xFFF80000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm1 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm1 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    movw $4644, %r11w # imm = 0x1224
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm6 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $511, %r11d # imm = 0x1FF
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    movw $9288, %r11w # imm = 0x2448
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k3
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r11d # imm = 0x3FE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm12, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm12, %ymm11 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm12 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm14, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm12 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm11, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm12, (%r10)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
@@ -11698,1635 +11742,1628 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i8_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm13
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm31
-; AVX512-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm18
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm18 ^ ymm12))
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
 ; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vmovdqa64 96(%rdi), %ymm19
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm19 ^ ymm31))
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm21
-; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm29
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm29 ^ ymm21))
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm7
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm1 & mem)
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm30
+; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm31
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30))
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm10
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm4
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT:    vinserti32x4 $2, %xmm11, %zmm1, %zmm11
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm5))
+; AVX512-NEXT:    vmovdqa64 288(%rdi), %ymm19
+; AVX512-NEXT:    vmovdqa64 256(%rdi), %ymm17
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm5
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm17 ^ ymm19))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u]
+; AVX512-NEXT:    vpor %xmm5, %xmm14, %xmm14
+; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm6
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm16
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm15
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm16 ^ ymm6))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm15[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4,5],ymm2[6],ymm15[7,8,9],ymm2[10],ymm15[11,12,13],ymm2[14],ymm15[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm14 & ymm27)
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vmovdqa64 416(%rdi), %ymm20
+; AVX512-NEXT:    vmovdqa64 384(%rdi), %ymm21
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm20 ^ (ymm14 & (ymm21 ^ ymm20))
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero
+; AVX512-NEXT:    vpor %xmm3, %xmm14, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm6
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm5, %xmm7, %xmm5
-; AVX512-NEXT:    vinserti32x4 $2, %xmm5, %zmm1, %zmm22
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm4 & (zmm22 ^ zmm2))
-; AVX512-NEXT:    vmovdqa64 288(%rdi), %ymm18
-; AVX512-NEXT:    vmovdqa64 256(%rdi), %ymm16
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512-NEXT:    vpor %xmm5, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa64 352(%rdi), %ymm17
-; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm28
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm7
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm23)
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm15
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm2))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ zmm11))
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm15, %xmm2
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm15
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm31 ^ ymm19))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm29 ^ ymm21))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ~mem)
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm31 ^ ymm30))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm24
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm4 & (zmm24 ^ zmm15))
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
-; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15)
-; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT:    vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm5
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT:    vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm26 & (zmm2 ^ zmm3))
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm19 ^ ymm17))
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm16 ^ ymm6))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7,8,9,10],ymm14[11],ymm11[12,13],ymm14[14],ymm11[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm3 & ymm27)
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm20 ^ (ymm3 & (ymm21 ^ ymm20))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
+; AVX512-NEXT:    vpor %xmm3, %xmm14, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm11))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm26
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm2))
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm28 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm28)
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT:    vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm6, %xmm3
-; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm25
-; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm15)
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT:    vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm25 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm25 & (zmm2 ^ zmm3))
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm19 ^ ymm17))
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm11, %xmm3
 ; AVX512-NEXT:    vmovdqa %ymm15, %ymm11
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-NEXT:    vmovdqa %xmm5, %xmm10
-; AVX512-NEXT:    vpor %xmm6, %xmm15, %xmm6
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm0, %xmm15, %xmm0
-; AVX512-NEXT:    vmovdqa64 416(%rdi), %ymm26
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm6, %zmm30
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %ymm27
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm8))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20
-; AVX512-NEXT:    vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm11 & (ymm6 ^ ymm16))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4,5,6],ymm14[7,8],ymm11[9,10],ymm14[11],ymm11[12,13,14],ymm14[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm3 & ymm27)
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm20 ^ (ymm3 & (ymm21 ^ ymm20))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
+; AVX512-NEXT:    vpor %xmm3, %xmm14, %xmm3
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm11))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm2))
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm28 ^ ymm17))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
-; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm22
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm24))
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm17 ^ ymm28))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
-; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm24
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm25))
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm28)
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm4, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ (mem & (ymm11 ^ ymm2))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm2, %xmm14, %xmm2
+; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm11, %zmm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm25 & (zmm2 ^ zmm3))
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm6 ^ ymm16))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3],ymm11[4],ymm3[5,6],ymm11[7,8],ymm3[9,10,11],ymm11[12],ymm3[13,14],ymm11[15]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm19 ^ ymm17))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u]
+; AVX512-NEXT:    vpor %xmm7, %xmm11, %xmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm27) | ymm3
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm20 ^ ymm21))
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm7))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm29 & (zmm25 ^ zmm2))
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm6 ^ ymm16))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512-NEXT:    vpor %xmm7, %xmm3, %xmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm27) | ymm2
 ; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm18 ^ ymm16))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
+; AVX512-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm3))
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm16 ^ ymm6))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm2))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
+; AVX512-NEXT:    vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT:    vinserti32x4 $1, %xmm2, %ymm0, %ymm27
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm23 & (ymm27 ^ ymm3))
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm12 ^ ymm18))
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm30
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm1 & (ymm30 ^ ymm2))
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm28 ^ ymm17))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm23
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm1 & (ymm23 ^ ymm2))
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm27 ^ ymm26))
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm13 ^ ymm12))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vporq %xmm3, %xmm2, %xmm23
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm13 ^ ymm10))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm24 = ymm20 ^ (ymm24 & (ymm21 ^ ymm20))
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm12 ^ ymm18))
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512-NEXT:    vmovdqa %ymm9, %ymm15
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm31 ^ ymm19))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm6, %xmm9, %xmm6
-; AVX512-NEXT:    vmovdqa %ymm14, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm14 & (ymm16 ^ ymm18))
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm9
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm19 ^ ymm31))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm19 ^ ymm31))
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15]
+; AVX512-NEXT:    vporq %xmm3, %xmm2, %xmm20
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm17 = ymm19 ^ (ymm9 & (ymm17 ^ ymm19))
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm13 ^ (ymm9 & (ymm10 ^ ymm13))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4],ymm11[5,6],ymm3[7,8],ymm11[9,10,11],ymm3[12],ymm11[13,14],ymm3[15]
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm18 ^ (ymm8 & (ymm12 ^ ymm18))
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa %ymm11, %ymm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm11)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm2 & ymm1)
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm1)
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm29 ^ (ymm12 & (ymm21 ^ ymm29))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm2
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm10, %xmm13
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-NEXT:    vpor %xmm2, %xmm6, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm23 = (ymm23 & ymm28) | ymm3
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm20 = (ymm20 & ymm28) | ymm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7,8],ymm2[9],ymm9[10,11,12],ymm2[13],ymm9[14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm8, %xmm10
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm28) | ymm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm30 ^ ymm31))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm3
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-NEXT:    vpor %xmm3, %xmm8, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm28 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
-; AVX512-NEXT:    vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm29 ^ (ymm15 & (ymm21 ^ ymm29))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm29 ^ ymm21))
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm5 & (ymm3 ^ ymm2))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm6 ^ (ymm15 & (ymm16 ^ ymm6))
+; AVX512-NEXT:    vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm30 ^ ymm31))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm30 ^ (ymm15 & (ymm31 ^ ymm30))
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm3))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm6
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm6, %xmm3
-; AVX512-NEXT:    vpshufb %xmm5, %xmm12, %xmm6
-; AVX512-NEXT:    vmovdqa %xmm12, %xmm15
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-NEXT:    vpor %xmm6, %xmm12, %xmm6
+; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm8
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm23))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm6
+; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm11
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-NEXT:    vpor %xmm12, %xmm11, %xmm11
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm3))
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm3
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm5 & (ymm11 ^ ymm3))
+; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm3
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
-; AVX512-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u]
+; AVX512-NEXT:    vpor %xmm3, %xmm12, %xmm3
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm6
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-NEXT:    vpor %xmm4, %xmm6, %xmm4
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm1 & (ymm12 ^ ymm3))
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
-; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm6
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm9))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm5 & (ymm4 ^ ymm3))
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ zmm20))
+; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm6
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm10))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm2
 ; AVX512-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
-; AVX512-NEXT:    vextracti32x4 $1, %ymm16, %xmm1
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm8 {%k1}
+; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k1}
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm16[2,3,0,1]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
+; AVX512-NEXT:    vextracti32x4 $1, %ymm17, %xmm1
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
 ; AVX512-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
-; AVX512-NEXT:    vpor %xmm0, %xmm4, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-NEXT:    vextracti32x4 $1, %ymm24, %xmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
+; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm20, (%rsi)
-; AVX512-NEXT:    vmovdqa64 %zmm22, (%rdx)
-; AVX512-NEXT:    vmovdqa64 %zmm24, (%rcx)
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512-NEXT:    vmovdqa64 %zmm26, (%rdx)
+; AVX512-NEXT:    vmovdqa64 %zmm22, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm25, (%r8)
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512-NEXT:    vmovdqa64 %zmm8, (%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm31
+; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm29
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29))
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm31
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm28
-; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm30
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm27
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512-FCP-NEXT:    vmovdqa 240(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1))
-; AVX512-FCP-NEXT:    vmovdqa64 288(%rdi), %ymm16
-; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm1, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm2))
+; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm15
+; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm2, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm30
 ; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26)
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm17
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm30 ^ (ymm11 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7,8,9],ymm13[10],ymm11[11,12,13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm9 & ymm25)
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm16
 ; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm18
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem)
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm15, %ymm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm8, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13))
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
-; AVX512-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
-; AVX512-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm24
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm16 ^ (ymm9 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm11))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm28 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm28 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4,5],ymm3[6],ymm7[7,8,9],ymm3[10],ymm7[11,12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ~mem)
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm26 ^ (ymm5 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25)
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8))
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm9))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm23
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ymm22)
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm27 ^ (ymm5 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,5,6]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm21 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm7 ^ (zmm21 & (zmm5 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26)
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5,6],ymm11[7,8],ymm9[9,10],ymm11[11],ymm9[12,13,14],ymm11[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm21
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25)
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm9))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm19
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5,6],ymm3[7,8],ymm7[9,10],ymm3[11],ymm7[12,13,14],ymm3[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ymm22)
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm24 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm5))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm22
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0))
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm21 & (zmm3 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm4
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm16 ^ ymm18))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm21
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm3))
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm28
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm16 ^ ymm18))
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm29
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3))
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ~mem) | ymm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm16 ^ ymm18))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
 ; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
+; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm25
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm25 = ymm25 ^ (ymm20 & (ymm25 ^ ymm5))
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm12 ^ (ymm5 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm17 = ymm16 ^ (ymm17 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm11
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm15 ^ (ymm8 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm7
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6],ymm3[7,8],ymm5[9,10,11],ymm3[12],ymm5[13,14],ymm3[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm22) | ymm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm22) | ymm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7,8],ymm3[9],ymm8[10,11,12],ymm3[13],ymm8[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm26
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3))
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm3, %xmm12, %xmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm12, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm12
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm10, %xmm4
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25)
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm22) | ymm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm27 ^ (ymm12 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm26 ^ (ymm13 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm3
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0))
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm0))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm0, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0))
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm24 & (ymm6 ^ ymm0))
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm0, %xmm8, %xmm8
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1))
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0]
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10))
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7))
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm24 & (ymm9 ^ ymm8))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [1,2,4,6,0,0,0,0]
+; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm10))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm7))
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
 ; AVX512-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u]
-; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
-; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
+; AVX512-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm2
+; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
-; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
+; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, (%rdx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $24, %rsp
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm13
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm31
-; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
+; AVX512DQ-NEXT:    subq $56, %rsp
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm27 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 32(%rdi), %ymm18
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm13
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm25 ^ ymm18))
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa64 96(%rdi), %ymm28
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm25
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm10
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm1 & mem)
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm30
+; AVX512DQ-NEXT:    vmovdqa64 160(%rdi), %ymm31
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm10
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm6, %zmm1, %zmm22
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm2))
-; AVX512DQ-NEXT:    vmovdqa64 288(%rdi), %ymm18
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm1, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm6))
+; AVX512DQ-NEXT:    vmovdqa64 288(%rdi), %ymm19
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %ymm17
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512DQ-NEXT:    vpor %xmm6, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %ymm21
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm6
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm19 ^ (ymm6 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u]
+; AVX512DQ-NEXT:    vpor %xmm6, %xmm15, %xmm15
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm12
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %ymm16
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm7
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4,5],ymm2[6],ymm9[7,8,9],ymm2[10],ymm9[11,12,13],ymm2[14],ymm9[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm24)
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm15
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm15 & ymm24)
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa64 416(%rdi), %ymm20
+; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %ymm21
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm6 & (ymm9 ^ ymm2))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm14))
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm15, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm15
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm29
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm28 ^ (ymm15 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm6
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm4 ^ ymm25))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7,8,9],ymm7[10],ymm9[11,12,13],ymm7[14],ymm9[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ~mem)
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm31 ^ ymm30))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm26
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm23
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm15))
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm15
-; AVX512DQ-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm0 & (ymm25 ^ ymm6))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm19
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm11, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm25
-; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm27)
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm11
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm19 ^ (ymm0 & (ymm15 ^ ymm19))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm5, %xmm15, %xmm5
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm0))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm0, %xmm15, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 416(%rdi), %ymm26
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm30
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %ymm27
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm8))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20
-; AVX512DQ-NEXT:    vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
-; AVX512DQ-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm22
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm23))
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm9))
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm17 ^ (ymm9 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm12 ^ (ymm11 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7,8,9,10],ymm14[11],ymm11[12,13],ymm14[14],ymm11[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm9 & ymm24)
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10]
+; AVX512DQ-NEXT:    vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm6 & (ymm9 ^ ymm11))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm4
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm29 & (zmm4 ^ zmm2))
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm22)
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm11, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm11 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm11 & (zmm2 ^ zmm9))
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm17 ^ (ymm9 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm14
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u]
+; AVX512DQ-NEXT:    vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm16 ^ (ymm14 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4,5,6],ymm3[7,8],ymm14[9,10],ymm3[11],ymm14[12,13,14],ymm3[15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
-; AVX512DQ-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm23
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm25))
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm9 & ymm24)
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11]
+; AVX512DQ-NEXT:    vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm26 & (ymm9 ^ ymm3))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm23
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm29 & (zmm23 ^ zmm2))
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm2))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm22)
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm4, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm9, %xmm14, %xmm9
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm28 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm28 & (ymm9 ^ ymm2))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, %xmm6
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm14, %xmm2
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm9, %zmm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm11 & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u]
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm24) | ymm3
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm3, %xmm9, %xmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm26 & (ymm3 ^ ymm7))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm2))
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm24) | ymm2
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm26 & (ymm4 ^ ymm3))
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm0 & (ymm7 ^ ymm3))
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm18 ^ ymm25))
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm24
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm24 = ymm24 ^ (ymm29 & (ymm24 ^ ymm2))
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vporq %xmm3, %xmm2, %xmm24
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm27 = ymm20 ^ (ymm27 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm18 ^ ymm25))
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm30
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm29 & (ymm30 ^ ymm2))
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vporq %xmm2, %xmm0, %xmm29
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm26 ^ (ymm19 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm15
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm17 = ymm18 ^ (ymm14 & (ymm17 ^ ymm18))
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm9
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm29 & ymm11)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ymm1)
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm3 & ymm1)
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm5 & (ymm13 ^ ymm6))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vporq %xmm3, %xmm2, %xmm20
+; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm17 = ymm19 ^ (ymm5 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm13 ^ (ymm5 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4],ymm11[5,6],ymm3[7,8],ymm11[9,10,11],ymm3[12],ymm11[13,14],ymm3[15]
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm18 ^ ymm25))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm24 = (ymm24 & ymm1) | ymm3
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm20 = (ymm20 & ymm1) | ymm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm10
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm1) | ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm18 & (ymm2 ^ ymm0))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm13 ^ ymm6))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm7 & (ymm6 ^ ymm13))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm28 & (ymm2 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm12 ^ (ymm15 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm12 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm11 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm30 ^ (ymm15 & (ymm31 ^ ymm30))
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm14, %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm8
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm12))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-NEXT:    vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm24))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm11
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-NEXT:    vpor %xmm3, %xmm9, %xmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm28 & (ymm3 ^ ymm2))
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u]
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-NEXT:    vpor %xmm5, %xmm11, %xmm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm18 & (ymm5 ^ ymm3))
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm18 & (ymm8 ^ ymm3))
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm28 & (ymm5 ^ ymm2))
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm20))
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm6, %xmm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm9))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm10))
 ; AVX512DQ-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm8 {%k1}
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
 ; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm17, %xmm1
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
-; AVX512DQ-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpor %xmm5, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm27, %xmm0
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
 ; AVX512DQ-NEXT:    vpor %xmm0, %xmm4, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, (%rdx)
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, (%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, (%r8)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-NEXT:    addq $24, %rsp
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT:    addq $56, %rsp
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    pushq %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm29 ^ ymm31))
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm10 ^ (ymm1 & (ymm12 ^ ymm10))
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm30
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm4, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 288(%rdi), %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm19 & (zmm9 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm15
 ; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26)
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7,8,9],ymm5[10],ymm13[11,12,13],ymm5[14],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm11 & ymm25)
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm11 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm28 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm31 ^ (ymm5 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm10 ^ ymm12))
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ~mem)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm26 ^ (ymm5 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm8))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ymm25)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm16 ^ (ymm8 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
-; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm8, %xmm8
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm9))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm23
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm31 ^ (ymm5 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm10 ^ ymm12))
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ymm22)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm27 ^ (ymm5 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm21 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm8 ^ (zmm21 & (zmm9 ^ zmm8))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm30 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4,5,6],ymm14[7,8],ymm11[9,10],ymm14[11],ymm11[12,13,14],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm8 & ymm25)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm16 ^ (ymm8 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11]
+; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm11))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm10 ^ ymm12))
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5,6],ymm3[7,8],ymm9[9,10],ymm3[11],ymm9[12,13,14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ymm22)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm5
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm24 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm21 & (zmm3 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm30 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
+; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm17 ^ (ymm4 & (ymm16 ^ ymm17))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm30 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm25) | ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm16 ^ ymm17))
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm25
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm25 = ymm25 ^ (ymm20 & (ymm25 ^ ymm4))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm2 ^ ymm30))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ~mem) | ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm16 ^ ymm17))
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm12 ^ (ymm5 & (ymm10 ^ ymm12))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm18 = ymm16 ^ (ymm18 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm12 ^ ymm10))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm15 ^ (ymm0 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm12 ^ ymm10))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm29 ^ (ymm8 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm10, %xmm8, %xmm10
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm29 ^ (ymm7 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm12
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm15
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vporq %xmm11, %xmm0, %xmm16
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6],ymm3[7,8],ymm5[9,10,11],ymm3[12],ymm5[13,14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm22) | ymm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm22) | ymm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm9, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm22) | ymm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm27 ^ (ymm12 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm26 ^ (ymm13 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11))
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm0))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm24 & (ymm6 ^ ymm0))
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm24 & (ymm9 ^ ymm7))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm7 & (zmm3 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm7 & (zmm6 ^ zmm8))
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm8, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm10, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm7 & (zmm8 ^ zmm5))
 ; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm25, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
+; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -13336,367 +13373,374 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512DQ-FCP-NEXT:    popq %rax
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i8_stride7_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm25
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm18
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm24
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm9
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %ymm2
 ; AVX512BW-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm3 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm0 {%k1}
 ; AVX512BW-NEXT:    kmovq %k1, %k2
-; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm4, %xmm3, %xmm16
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm3, %xmm0, %xmm17
+; AVX512BW-NEXT:    movw $9288, %ax # imm = 0x2448
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm2, %ymm0 {%k6}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu16 %ymm0, %ymm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT:    vmovdqu16 %ymm0, %ymm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm12
 ; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm6
 ; AVX512BW-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512BW-NEXT:    kmovd %eax, %k6
-; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm6, %ymm0 {%k3}
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT:    vpshufb %xmm21, %xmm7, %xmm3
-; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm8
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm8
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT:    vpshufb %xmm22, %xmm8, %xmm3
+; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT:    vmovdqa64 240(%rdi), %xmm26
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqa 224(%rdi), %xmm4
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm5, %xmm12, %xmm5
+; AVX512BW-NEXT:    vmovdqa64 240(%rdi), %xmm24
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqa64 224(%rdi), %xmm25
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm5, %xmm10, %xmm5
 ; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
 ; AVX512BW-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k5}
-; AVX512BW-NEXT:    vmovdqa 288(%rdi), %ymm13
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm12
-; AVX512BW-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512BW-NEXT:    kmovd %eax, %k3
-; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm17 {%k5}
+; AVX512BW-NEXT:    vmovdqa 288(%rdi), %ymm15
+; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm14
+; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm14, %ymm0 {%k6}
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512BW-NEXT:    vporq %xmm5, %xmm0, %xmm19
-; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %ymm17
+; AVX512BW-NEXT:    vporq %xmm5, %xmm0, %xmm20
+; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %ymm18
 ; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm0
-; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
+; AVX512BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm5 {%k3}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4,5],ymm10[6],ymm5[7,8,9],ymm10[10],ymm5[11,12,13],ymm10[14],ymm5[15]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512BW-NEXT:    kmovd %eax, %k7
-; AVX512BW-NEXT:    vmovdqu16 %ymm5, %ymm19 {%k7}
-; AVX512BW-NEXT:    vmovdqa 416(%rdi), %ymm15
+; AVX512BW-NEXT:    vmovdqu16 %ymm5, %ymm20 {%k7}
+; AVX512BW-NEXT:    vmovdqa64 416(%rdi), %ymm16
 ; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm5
 ; AVX512BW-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512BW-NEXT:    kmovd %eax, %k4
-; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm22
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm10 {%k4}
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm10, %xmm19
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero
+; AVX512BW-NEXT:    vporq %xmm19, %xmm10, %xmm10
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm10, %ymm0, %ymm21
 ; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm10 {%k4}
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm10, %xmm19
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm19, %xmm10, %xmm19
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm11, %ymm10 {%k2}
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm13
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movl $511, %edi # imm = 0x1FF
 ; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm22, %ymm9 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm14, %xmm22, %xmm14
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k5}
-; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k6}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT:    vmovdqu8 %ymm19, %ymm10 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm6, %ymm13 {%k6}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm19, %xmm13, %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm10 {%k5}
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm11, %ymm3 {%k4}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15]
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm13 {%k3}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm19, %xmm13, %xmm13
 ; AVX512BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
 ; AVX512BW-NEXT:    kmovd %edi, %k5
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm14, %xmm22
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm22, %xmm14, %xmm14
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpshufb %xmm21, %xmm8, %xmm21
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm14, %xmm21, %xmm14
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm3, %zmm22
-; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm18
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k2}
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm19
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm19, %xmm3, %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpshufb %xmm22, %xmm9, %xmm19
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm4, %xmm19, %xmm4
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm22
+; AVX512BW-NEXT:    vmovdqu16 %zmm13, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm11, %ymm3 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15]
+; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm4 {%k6}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k4}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm18, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpor %xmm3, %xmm13, %xmm3
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-NEXT:    vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512BW-NEXT:    kmovd %edi, %k2
 ; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    vmovdqu8 %ymm18, %ymm3 {%k2}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm18, %zmm3, %zmm18
-; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm3 {%k2}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm3, %zmm19
+; AVX512BW-NEXT:    vmovdqu16 %zmm4, %zmm19 {%k1}
 ; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu8 %ymm20, %ymm19 {%k2}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm3
 ; AVX512BW-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm14, %ymm15, %ymm3 {%k4}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm4 {%k6}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm4 {%k3}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm10 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm14, %ymm15, %ymm3 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm4 {%k4}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2],ymm13[3],ymm4[4,5,6],ymm13[7,8],ymm4[9,10],ymm13[11],ymm4[12,13,14],ymm13[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm4 {%k6}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm14, %ymm15, %ymm3 {%k6}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm14
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm4 {%k3}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3],ymm13[4],ymm4[5,6],ymm13[7,8],ymm4[9,10,11],ymm13[12],ymm4[13,14],ymm13[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm4 {%k4}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm13
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512BW-NEXT:    vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm19 {%k1}
 ; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512BW-NEXT:    vporq %xmm3, %xmm2, %xmm19
-; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqu16 %ymm2, %ymm19 {%k7}
-; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm14, %ymm3 {%k1}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512BW-NEXT:    vporq %xmm4, %xmm3, %xmm20
+; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm3 {%k6}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm20 {%k7}
+; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm14, %ymm3 {%k4}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm4 {%k1}
 ; AVX512BW-NEXT:    kmovq %k1, %k7
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7,8],ymm13[9],ymm4[10,11,12],ymm13[13],ymm4[14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k1}
-; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm19 {%k2}
-; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k2}
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm2 {%k6}
-; AVX512BW-NEXT:    vpblendmw %ymm10, %ymm1, %ymm21 {%k6}
-; AVX512BW-NEXT:    vmovdqu16 %ymm17, %ymm0 {%k4}
+; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm21 {%k1}
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm3 {%k3}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm20 {%k2}
+; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm3 {%k6}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm21 {%k2}
+; AVX512BW-NEXT:    vmovdqu16 %ymm18, %ymm0 {%k4}
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512BW-NEXT:    vmovdqu16 %ymm13, %ymm12 {%k6}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm0, %xmm12, %xmm0
+; AVX512BW-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k3}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm0, %xmm4, %xmm0
 ; AVX512BW-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovdqu16 %ymm15, %ymm5 {%k7}
-; AVX512BW-NEXT:    vpblendmw %ymm10, %ymm1, %ymm12 {%k4}
-; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
-; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm1 {%k3}
-; AVX512BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k4}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm2, %xmm10, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm2 {%k1}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
+; AVX512BW-NEXT:    vmovdqu16 %ymm16, %ymm5 {%k7}
+; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm11, %ymm13 {%k6}
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k3}
+; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm2, %ymm15 {%k4}
+; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm1, %ymm14 {%k3}
+; AVX512BW-NEXT:    vmovdqu16 %ymm11, %ymm2 {%k3}
+; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm1, %ymm4 {%k4}
+; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm11 {%k6}
+; AVX512BW-NEXT:    vmovdqu16 %ymm7, %ymm1 {%k6}
+; AVX512BW-NEXT:    vmovdqu16 %ymm12, %ymm6 {%k4}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm10, %xmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm7, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
-; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm3 {%k1}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm10
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm6, %xmm10, %xmm6
-; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
-; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm7, %zmm7
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm8, %zmm8
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512BW-NEXT:    vpermw %zmm25, %zmm10, %zmm10
-; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm11
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm3 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm7, %xmm11, %xmm7
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
 ; AVX512BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm12
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm11 {%k5}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vmovdqu8 %ymm11, %ymm7 {%k1}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm6, %xmm11, %xmm6
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
+; AVX512BW-NEXT:    vpor %xmm8, %xmm9, %xmm8
+; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k1}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm8
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4],ymm13[5,6],ymm8[7,8],ymm13[9,10,11],ymm8[12],ymm13[13,14],ymm8[15]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm4, %xmm9, %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm25, %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k5}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm3
 ; AVX512BW-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm21, %xmm12
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm12 {%k5}
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm15, %xmm9
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7,8],ymm9[9],ymm15[10,11],ymm9[12],ymm15[13,14,15]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm11 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm11 {%k5}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm7
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm26, %xmm3
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm1 {%k5}
+; AVX512BW-NEXT:    vpor %xmm7, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm24, %xmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k5}
 ; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm11 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm12 {%k1}
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm2
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
@@ -13708,123 +13752,111 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%rdi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm0 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
 ; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
 ; AVX512BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm5 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovq %k1, %k3
 ; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm6, %ymm6
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT:    vmovdqa 240(%rdi), %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm16
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k5}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k5}
 ; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
 ; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
 ; AVX512BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm7 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm7, %xmm21
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm4 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm7, %xmm4, %xmm17
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,1,0,0,4,0,0,0,8,25,18,11,0,29,22,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm17 {%k7}
 ; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm7
 ; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX512BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
-; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm8 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm8, %xmm18
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm8 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm14
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm8, %zmm8
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
-; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm8 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT:    movl $511, %eax # imm = 0x1FF
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm14 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k5}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
@@ -13832,144 +13864,156 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
-; AVX512BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT:    movl $261632, %eax # imm = 0x3FE00
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm15 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,3,5,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm14, %xmm19
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm14 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm14, %xmm20
+; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
 ; AVX512BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm20, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm13 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm16
 ; AVX512BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,1,0,0,0,5,0,0,8,0,26,19,12,29,22,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
 ; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm15 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm17 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10]
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,5,0,0,16,9,26,19,12,0,30,23]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm17 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm12 {%k2}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm16 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,0,6,0,16,9,0,27,20,13,30,23]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm13 {%k2}
 ; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm16 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm16 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,6,0,24,17,10,27,20,13,0,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm18
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,0,7,24,17,10,0,28,21,14,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm18 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm16 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k3}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm18 {%k1}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
@@ -13979,8 +14023,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
@@ -13990,8 +14034,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
 ; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k3}
@@ -13999,82 +14043,82 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm14, %xmm9
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm9 {%k3}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm17, %zmm17
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm18, %zmm18
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm20, %zmm20
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
-; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm19, %ymm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm10
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k3}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm14, %xmm9
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,2,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm15
+; AVX512BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm10
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm10, %xmm16, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm14, %xmm10
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k5}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm11, %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm3 {%k5}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm11, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,5,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm15, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm20, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k5}
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm5, %xmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,4,0,0,7,0,25,18,11,28,21,14,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm4 {%k2}
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm1, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm1, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r9)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rdi)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
@@ -14082,366 +14126,374 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm25
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm18
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm24
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %ymm2
 ; AVX512DQ-BW-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm0 {%k1}
 ; AVX512DQ-BW-NEXT:    kmovq %k1, %k2
-; AVX512DQ-BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm4, %xmm3, %xmm16
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm3, %xmm0, %xmm17
+; AVX512DQ-BW-NEXT:    movw $9288, %ax # imm = 0x2448
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm10, %ymm2, %ymm0 {%k6}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm0, %ymm16 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm0, %ymm17 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm12
 ; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm6
 ; AVX512DQ-BW-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm6, %ymm0 {%k3}
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm8
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 240(%rdi), %xmm26
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 240(%rdi), %xmm24
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqa64 224(%rdi), %xmm25
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm11, %xmm5
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k5}
-; AVX512DQ-BW-NEXT:    vmovdqa 288(%rdi), %ymm13
-; AVX512DQ-BW-NEXT:    vmovdqa 256(%rdi), %ymm12
-; AVX512DQ-BW-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm17 {%k5}
+; AVX512DQ-BW-NEXT:    vmovdqa 288(%rdi), %ymm14
+; AVX512DQ-BW-NEXT:    vmovdqa 256(%rdi), %ymm13
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm14, %ymm13, %ymm0 {%k6}
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm5, %xmm0, %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %ymm17
+; AVX512DQ-BW-NEXT:    vporq %xmm5, %xmm0, %xmm20
+; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %ymm18
 ; AVX512DQ-BW-NEXT:    vmovdqa 320(%rdi), %ymm0
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm11 = ymm5[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3,4,5],ymm11[6],ymm5[7,8,9],ymm11[10],ymm5[11,12,13],ymm11[14],ymm5[15]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k7
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm5, %ymm19 {%k7}
-; AVX512DQ-BW-NEXT:    vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm5, %ymm20 {%k7}
+; AVX512DQ-BW-NEXT:    vmovdqa64 416(%rdi), %ymm16
 ; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm5
 ; AVX512DQ-BW-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm22
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm11 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm11, %xmm19
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm11, %ymm0, %ymm21
 ; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm11 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm11, %xmm19
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm11, %xmm19
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm10, %ymm11 {%k2}
+; AVX512DQ-BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7,8,9],ymm15[10],ymm11[11,12,13],ymm15[14],ymm11[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    movl $511, %edi # imm = 0x1FF
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm22, %ymm10 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm22, %xmm14
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm19, %ymm11 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm6, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm15, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm11 {%k5}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm10, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8,9,10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm15 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm15, %xmm15
 ; AVX512DQ-BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm14, %xmm22
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm8, %xmm21
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm21, %xmm14
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm3, %zmm23
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm15 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm19
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm9, %xmm19
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm4, %xmm19, %xmm4
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm22
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm15, %zmm22 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm10, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm7, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm15, %xmm3
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512DQ-BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
 ; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
 ; AVX512DQ-BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm18, %ymm3 {%k2}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm18, %zmm3, %zmm18
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm15, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm15, %zmm3, %zmm19
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm4, %zmm19 {%k1}
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm20, %ymm19 {%k2}
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm3
 ; AVX512DQ-BW-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm10 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm14, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3],ymm4[4,5],ymm15[6],ymm4[7,8,9,10],ymm15[11],ymm4[12,13],ymm15[14],ymm4[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm11 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm14, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4,5,6],ymm15[7,8],ymm4[9,10],ymm15[11],ymm4[12,13,14],ymm15[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm16, %ymm5, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm22 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm14, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3],ymm15[4],ymm4[5,6],ymm15[7,8],ymm4[9,10,11],ymm15[12],ymm4[13,14],ymm15[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm19 {%k1}
 ; AVX512DQ-BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT:    vporq %xmm3, %xmm2, %xmm19
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm2, %ymm19 {%k7}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm14, %ymm13, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT:    vporq %xmm4, %xmm3, %xmm20
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm18, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm20 {%k7}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm14, %ymm13, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm18, %ymm0, %ymm4 {%k1}
 ; AVX512DQ-BW-NEXT:    kmovq %k1, %k7
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7,8],ymm15[9],ymm4[10,11,12],ymm15[13],ymm4[14,15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm19 {%k2}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k2}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm22 {%k6}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm9, %ymm1, %ymm21 {%k6}
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm17, %ymm0 {%k4}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm13, %ymm12 {%k6}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm21 {%k1}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm20 {%k2}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm16, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm21 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm18, %ymm0 {%k4}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm14, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm4, %xmm0
 ; AVX512DQ-BW-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm15, %ymm5 {%k7}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm9, %ymm1, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm9, %ymm1 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k4}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm16, %ymm5 {%k7}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm10, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm10, %ymm2, %ymm14 {%k4}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm1, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm2 {%k3}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm1, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm12, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm7, %ymm1 {%k6}
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm12, %ymm6 {%k4}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm3, %xmm3
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
-; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm3 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm10[u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm10, %xmm7
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
+; AVX512DQ-BW-NEXT:    vpor %xmm10, %xmm12, %xmm10
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm10, %xmm6
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
-; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm7, %zmm7
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm8, %zmm8
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm11, %zmm11
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm12
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm12, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm12
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
+; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm9, %xmm8
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm8, %ymm6 {%k1}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm15, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm9, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm25, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k5}
 ; AVX512DQ-BW-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm21, %xmm9
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm9
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm12, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm26, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm1 {%k5}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm24, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm6, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k5}
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm20, %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm21, %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rdx)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, (%r8)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%r9)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rdi)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
@@ -14449,39 +14501,25 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
@@ -14489,71 +14527,75 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k3
 ; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm6, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm7 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm7, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm5 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm7, %xmm5, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,1,0,0,4,0,0,0,8,25,18,11,0,29,22,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm5, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm17 {%k7}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm5
 ; AVX512DQ-BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm8 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm8, %ymm0, %ymm18
 ; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm8 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm8, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
@@ -14561,144 +14603,154 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm15, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm15 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm14, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
 ; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm20, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm19, %zmm13 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,1,0,0,0,5,0,0,8,0,26,19,12,29,22,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm15 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,5,0,0,16,9,26,19,12,0,30,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm17 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm16 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,0,6,0,16,9,0,27,20,13,30,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm13 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm16 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,6,0,24,17,10,27,20,13,0,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,0,7,24,17,10,0,28,21,14,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm16 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm17 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm18 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
@@ -14708,8 +14760,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
@@ -14719,8 +14771,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k2}
@@ -14728,80 +14780,80 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm14, %xmm9
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm9 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm17, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm18, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm20, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm19, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm2 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm14, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm10, %xmm16, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm14, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm11, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm3 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm11, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm20, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,4,0,0,7,0,25,18,11,28,21,14,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm5 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm16, %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm17, %zmm0, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rdi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 99932c0026b23..950d18719c6f2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -2726,108 +2726,108 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512-NEXT:    vpmovqb %zmm4, %xmm5
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512-NEXT:    vpmovqb %zmm2, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512-NEXT:    vpsrlq $8, %zmm2, %zmm7
 ; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
-; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX512-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512-NEXT:    vpsrlq $16, %zmm2, %zmm8
 ; AVX512-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
+; AVX512-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
-; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512-NEXT:    vpsrlq $24, %zmm2, %zmm9
 ; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
-; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
-; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512-NEXT:    vpsrlq $32, %zmm2, %zmm10
 ; AVX512-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512-NEXT:    vpshufb %xmm10, %xmm6, %xmm11
+; AVX512-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512-NEXT:    vpsrlq $40, %zmm2, %zmm11
 ; AVX512-NEXT:    vpmovqb %zmm11, %xmm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512-NEXT:    vpshufb %xmm11, %xmm6, %xmm12
+; AVX512-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm13
+; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512-NEXT:    vpsrlq $48, %zmm2, %zmm12
 ; AVX512-NEXT:    vpmovqb %zmm12, %xmm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512-NEXT:    vpsrlq $56, %zmm2, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm3, (%rsi)
+; AVX512-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512-NEXT:    vmovdqa %xmm8, (%r8)
 ; AVX512-NEXT:    vmovdqa %xmm9, (%r9)
@@ -2842,74 +2842,76 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpmovqd %ymm9, %xmm8
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512-FCP-NEXT:    vpmovqd %ymm10, %xmm11
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpmovqd %ymm11, %xmm7
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vpermd %zmm13, %zmm1, %zmm14
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX512-FCP-NEXT:    vpmovqb %zmm12, %xmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm1
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm1
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT:    vpsrlq $8, %zmm12, %zmm2
-; AVX512-FCP-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm12, %zmm14
-; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT:    vpsrlq $8, %zmm12, %zmm3
+; AVX512-FCP-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm3
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm12, %zmm8
+; AVX512-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT:    vpsrlq $24, %zmm12, %zmm11
-; AVX512-FCP-NEXT:    vpmovqb %zmm11, %xmm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm11, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm12, %zmm4
-; AVX512-FCP-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm9, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT:    vpsrlq $40, %zmm12, %zmm5
-; AVX512-FCP-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT:    vpsrlq $24, %zmm12, %zmm14
+; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm13, %zmm14, %zmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm9
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,5,7,0,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm14, %ymm11
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm12, %zmm9
+; AVX512-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vpsrlq $40, %zmm12, %zmm6
+; AVX512-FCP-NEXT:    vpmovqb %zmm6, %xmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX512-FCP-NEXT:    vpsrlq $48, %zmm12, %zmm6
 ; AVX512-FCP-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT:    vpsrlq $56, %zmm12, %zmm7
-; AVX512-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vpsrlq $56, %zmm12, %zmm8
+; AVX512-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%r11)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%r10)
 ; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rax)
@@ -2921,108 +2923,108 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-NEXT:    vpmovqb %zmm5, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-NEXT:    vpmovqb %zmm4, %xmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512DQ-NEXT:    vpsrlq $8, %zmm2, %zmm7
 ; AVX512DQ-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm2, %zmm8
 ; AVX512DQ-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512DQ-NEXT:    vpsrlq $24, %zmm2, %zmm9
 ; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm2, %zmm10
 ; AVX512DQ-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm6, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512DQ-NEXT:    vpsrlq $40, %zmm2, %zmm11
 ; AVX512DQ-NEXT:    vpmovqb %zmm11, %xmm11
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm6, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm2, %zmm12
 ; AVX512DQ-NEXT:    vpmovqb %zmm12, %xmm12
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vpsrlq $56, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm8, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %xmm9, (%r9)
@@ -3037,74 +3039,76 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm10, %xmm11
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpmovqd %ymm11, %xmm7
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vpermd %zmm13, %zmm1, %zmm14
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm12, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm12, %zmm14
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm12, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm12, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm12, %zmm11
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm11, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm12, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm9, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm12, %zmm5
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm13, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm12, %zmm6
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm12, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%r11)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rax)
@@ -3117,107 +3121,107 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT:    vpmovqb %zmm5, %xmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-NEXT:    vpmovqb %zmm4, %xmm5
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512BW-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512BW-NEXT:    vpsrlq $8, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512BW-NEXT:    vpsrlq $16, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512BW-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512BW-NEXT:    vpsrlq $24, %zmm2, %zmm9
 ; AVX512BW-NEXT:    vpmovqb %zmm9, %xmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512BW-NEXT:    vpsrlq $32, %zmm2, %zmm10
 ; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm6, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512BW-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512BW-NEXT:    vpsrlq $40, %zmm2, %zmm11
 ; AVX512BW-NEXT:    vpmovqb %zmm11, %xmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
-; AVX512BW-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm6, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm0, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512BW-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpsrlq $48, %zmm2, %zmm12
 ; AVX512BW-NEXT:    vpmovqb %zmm12, %xmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512BW-NEXT:    vpsrlq $56, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm3, (%rsi)
+; AVX512BW-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa %xmm8, (%r8)
 ; AVX512BW-NEXT:    vmovdqa %xmm9, (%r9)
@@ -3232,67 +3236,69 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX512BW-FCP-NEXT:    vpmovqd %ymm4, %xmm3
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm6, %xmm7
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm9, %xmm1
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vpermd %zmm7, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm1
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm6, %xmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm9, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm6, %zmm2
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm9, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm8, %xmm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm6, %zmm14
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm9, %zmm7
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm6, %zmm8
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm7, %zmm8, %zmm7
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm8
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,5,7,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm4, %ymm9, %ymm4
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX512BW-FCP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm9, %zmm7
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm7
-; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm6, %zmm8
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm8, %xmm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm9, %zmm10
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm6, %zmm9
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm10
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm6, %zmm10
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm6, %zmm6
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
@@ -3300,8 +3306,8 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%r11)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r11)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
 ; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -3312,107 +3318,107 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm5, %xmm6
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm4, %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm5
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm2, %zmm7
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm2, %zmm8
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm2, %zmm9
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm9, %xmm9
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm2, %zmm10
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm6, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm2, %zmm11
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm11, %xmm11
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm6, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm0, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm2, %zmm12
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm12, %xmm12
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rcx)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, (%r8)
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, (%r9)
@@ -3427,67 +3433,69 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm4, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm9, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm7, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm6, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm9, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm6, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm9, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm6, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm9, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm6, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm7, %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm4, %ymm9, %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm9, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm6, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm9, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm4, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm6, %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm6, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
@@ -3495,8 +3503,8 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%r11)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r11)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
@@ -5975,215 +5983,223 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i8_stride8_vf32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm16
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm4
+; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm14
+; AVX512-NEXT:    vpshufb %xmm5, %xmm14, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm11
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm11, %xmm6
+; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512-NEXT:    vpmovqb %ymm6, %xmm9
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
 ; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512-NEXT:    vpmovqb %zmm16, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm10
 ; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm12
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm11, %xmm6
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm2, %xmm11
+; AVX512-NEXT:    vmovdqa %xmm14, %xmm4
+; AVX512-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm11 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm10, %xmm14
+; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm11
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm22
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm11
+; AVX512-NEXT:    vmovd {{.*#+}} xmm15 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm11, %xmm0
+; AVX512-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm8
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm9
+; AVX512-NEXT:    vpshufb %xmm13, %xmm9, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm3
+; AVX512-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX512-NEXT:    vpsrlq $8, %zmm16, %zmm5
+; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm12, %xmm1
+; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm21
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm13
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm9
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512-NEXT:    vpshufb %xmm4, %xmm8, %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm13 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm10, %xmm15
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm23
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm11, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm14
+; AVX512-NEXT:    vpshufb %xmm15, %xmm14, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512-NEXT:    vpmovqb %ymm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm6
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm1
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm1
-; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm11
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX512-NEXT:    vpmovqb %zmm16, %xmm10
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm10
-; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm6
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm19
-; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm9, %xmm12
-; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm22
-; AVX512-NEXT:    vpshufb %xmm14, %xmm8, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm20
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm6, %xmm13
-; AVX512-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm13
-; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm0
-; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm24
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512-NEXT:    vpshufb %xmm14, %xmm4, %xmm2
-; AVX512-NEXT:    vpshufb %xmm14, %xmm3, %xmm4
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm9, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm22
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-NEXT:    vpsrlq $8, %zmm16, %zmm2
+; AVX512-NEXT:    vpsrlq $16, %zmm16, %zmm2
 ; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm3
-; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm8
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-NEXT:    vmovdqa %xmm6, %xmm12
-; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm6
-; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm6
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $16, %zmm16, %zmm2
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm19
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm9
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm14
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm22
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm28
+; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm21
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm12, %xmm14
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm27
-; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm12
-; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm5
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm5 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vmovdqa %xmm10, %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm10, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm11, %xmm15
+; AVX512-NEXT:    vmovdqa %xmm14, %xmm10
+; AVX512-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm14
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm5
+; AVX512-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm4
+; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm5
 ; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm11, %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
 ; AVX512-NEXT:    vpsrlq $24, %zmm16, %zmm2
 ; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm1
-; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm9, %xmm7
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm12, %xmm1
+; AVX512-NEXT:    vpshufb %xmm8, %xmm9, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm24
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm9
-; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm9
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm14
-; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm13, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm25
-; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm5 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512-NEXT:    vpshufb %xmm5, %xmm6, %xmm13
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm25
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm13 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm11, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm10, %xmm6
+; AVX512-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm1
-; AVX512-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
+; AVX512-NEXT:    vpshufb %xmm8, %xmm14, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm10
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm23
+; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm23
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512-NEXT:    vpsrlq $32, %zmm16, %zmm2
@@ -6191,43 +6207,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
-; AVX512-NEXT:    vpshufb %xmm5, %xmm7, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm13
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm12, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm27
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm8
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
-; AVX512-NEXT:    vmovdqa %xmm9, %xmm14
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm9, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm6
-; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm7
+; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm7
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm9
-; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm0
-; AVX512-NEXT:    vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm24
+; AVX512-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm6, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512-NEXT:    vpshufb %xmm5, %xmm8, %xmm1
-; AVX512-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm11
+; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm24
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm4
-; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm9
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512-NEXT:    vpsrlq $40, %zmm16, %zmm2
@@ -6235,41 +6254,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm11, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm23
-; AVX512-NEXT:    vpshufb %xmm10, %xmm13, %xmm2
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
+; AVX512-NEXT:    vpshufb %xmm12, %xmm10, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm11
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm25
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm15, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm25
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-NEXT:    vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm9, %xmm0
-; AVX512-NEXT:    vmovdqa %xmm9, %xmm14
-; AVX512-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm23
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm7
+; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm0
+; AVX512-NEXT:    vpshufb %xmm4, %xmm13, %xmm4
+; AVX512-NEXT:    vmovdqa %xmm13, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm8, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm8, %xmm9
-; AVX512-NEXT:    vpshufb %xmm10, %xmm15, %xmm3
+; AVX512-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm13
+; AVX512-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm11
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm10
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm4, %xmm8
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512-NEXT:    vpsrlq $48, %zmm16, %zmm2
@@ -6277,12 +6296,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm2
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
+; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
 ; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
@@ -6290,22 +6309,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
+; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm8
+; AVX512-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm14, %xmm7
-; AVX512-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
+; AVX512-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufb %xmm5, %xmm15, %xmm5
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
-; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm4
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
 ; AVX512-NEXT:    vpsrlq $56, %zmm16, %zmm3
@@ -6328,184 +6348,188 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-FCP-LABEL: load_i8_stride8_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm20
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm19
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT:    vpermd %ymm16, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
-; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpmovqd %ymm12, %xmm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm19
-; AVX512-FCP-NEXT:    vpmovqd %ymm19, %xmm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT:    vpmovqb %zmm20, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512-FCP-NEXT:    vpmovqd %ymm20, %xmm4
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,10,12,14,12,14,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT:    vpmovqb %zmm19, %xmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT:    vpsrlq $8, %zmm20, %zmm6
-; AVX512-FCP-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT:    vpsrlq $8, %zmm19, %zmm9
+; AVX512-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm28
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm20, %zmm6
-; AVX512-FCP-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm24
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm30
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm29
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm27
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm28
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm10, %xmm25
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm19, %zmm9
+; AVX512-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm26
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm24
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm1
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT:    vpsrlq $24, %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm1
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT:    vpsrlq $24, %zmm19, %zmm2
 ; AVX512-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm16, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm14
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vpermd %zmm17, %zmm11, %zmm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm20, %zmm15
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm16, %zmm0, %zmm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,0,0,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm20, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm19, %zmm15
 ; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512-FCP-NEXT:    vpsrlq $40, %zmm20, %zmm15
+; AVX512-FCP-NEXT:    vpsrlq $40, %zmm19, %zmm15
 ; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm13
 ; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm15
 ; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm15
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm20, %zmm15
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm19, %zmm15
 ; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm13
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT:    vpsrlq $56, %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512-FCP-NEXT:    vpsrlq $56, %zmm19, %zmm4
+; AVX512-FCP-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, (%rsi)
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa %ymm12, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
@@ -6514,215 +6538,223 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-LABEL: load_i8_stride8_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm16
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm4
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm14, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm11
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm11, %xmm6
+; AVX512DQ-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpmovqb %ymm6, %xmm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
 ; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512DQ-NEXT:    vpmovqb %zmm16, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm12
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm12, %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm11, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm2, %xmm11
+; AVX512DQ-NEXT:    vmovdqa %xmm14, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm11 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm10, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm11
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm15 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm8
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm9, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX512DQ-NEXT:    vpsrlq $8, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vpmovqb %zmm5, %xmm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm12, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm21
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm13
+; AVX512DQ-NEXT:    vmovdqa %xmm2, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm9
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm8, %xmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm13 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm10, %xmm15
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm0, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm23
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm14, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpmovqb %ymm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm6
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm11
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX512DQ-NEXT:    vpmovqb %zmm16, %xmm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm19
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm12
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm9, %xmm12
-; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm22
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm20
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm6, %xmm13
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm13
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm3, %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm22
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-NEXT:    vpsrlq $8, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm16, %zmm2
 ; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm16, %zmm2
-; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm19
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm9
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm14
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm22
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm28
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm21
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm12, %xmm14
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm27
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm5 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm10, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm15
+; AVX512DQ-NEXT:    vmovdqa %xmm14, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm5
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm11, %xmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
 ; AVX512DQ-NEXT:    vpsrlq $24, %zmm16, %zmm2
 ; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm7
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm12, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm24
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm25
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm5 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm6, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm25
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm13 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm14, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm23
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm23
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512DQ-NEXT:    vpsrlq $32, %zmm16, %zmm2
@@ -6730,43 +6762,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm7, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm13
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm12, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm8
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm14
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm7
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm12, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm8, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm11, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512DQ-NEXT:    vpsrlq $40, %zmm16, %zmm2
@@ -6774,41 +6809,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm11, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm23
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm2
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm10, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm14, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm25
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm15, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm25
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm9, %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm23
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm7, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm13, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm8, %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm11
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
 ; AVX512DQ-NEXT:    vpsrlq $48, %zmm16, %zmm2
@@ -6816,12 +6851,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm2
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
@@ -6829,22 +6864,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm14, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm15, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
 ; AVX512DQ-NEXT:    vpsrlq $56, %zmm16, %zmm3
@@ -6867,184 +6903,188 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT:    vpermd %ymm16, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm12, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm19, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm20, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT:    vpmovqd %ymm20, %xmm4
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm19, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm28
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm24
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm30
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm29
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm27
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm28
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm10, %xmm25
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm26
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm24
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm19, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm16, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm14
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm18, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm19, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm0, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm20, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm19, %zmm15
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm19, %zmm15
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm13
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm15
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm15
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm19, %zmm15
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
@@ -7056,291 +7096,296 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa 240(%rdi), %xmm2
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
-; AVX512BW-NEXT:    vmovdqa 224(%rdi), %xmm6
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm7
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm1
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm3
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm9, %xmm8
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT:    vpmovqb %ymm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm1, %xmm4
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa 224(%rdi), %xmm6
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm7
+; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm8
-; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm11
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
-; AVX512BW-NEXT:    vpmovqb %zmm16, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512BW-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm7
+; AVX512BW-NEXT:    vpmovqb %ymm7, %xmm9
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512BW-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5],ymm4[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm7
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
+; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpmovqb %zmm16, %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm20
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %xmm19
-; AVX512BW-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm12
-; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm13
+; AVX512BW-NEXT:    vmovdqa 144(%rdi), %xmm9
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm10
+; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm11
+; AVX512BW-NEXT:    vmovdqa 240(%rdi), %xmm12
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm11, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm9, %xmm17
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
-; AVX512BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm4, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $8, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $8, %zmm16, %zmm13
+; AVX512BW-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm17
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $16, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $16, %zmm16, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm22
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm11, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm9, %xmm17
 ; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $24, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $24, %zmm16, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm23
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm17
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm17[0],xmm13[0],xmm17[1],xmm13[1],xmm17[2],xmm13[2],xmm17[3],xmm13[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm11, %xmm17
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $32, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm16, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm24
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm17
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $40, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $40, %zmm16, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm0, %ymm25
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm17
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm4, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $48, %zmm16, %zmm10
-; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $48, %zmm16, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm12, %xmm8
 ; AVX512BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm11, %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512BW-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm13, %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm9, %xmm19, %xmm9
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm19, %xmm6
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm14, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
 ; AVX512BW-NEXT:    vpsrlq $56, %zmm16, %zmm3
 ; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %ymm20, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %ymm21, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %ymm22, (%rcx)
@@ -7354,167 +7399,172 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FCP-LABEL: load_i8_stride8_vf32:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm30
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm15
-; AVX512BW-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm12, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm16
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm16, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm4, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm31, %ymm3, %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,10,10,11,8,10,12,14]
+; AVX512BW-FCP-NEXT:    vpermd %zmm5, %zmm3, %zmm30
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpermd %zmm4, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vpmovqd %ymm10, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm7, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm3, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm2, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm18, %ymm6, %ymm11
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm2, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm11
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm0, %xmm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm4, %zmm10
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %ymm6, %ymm18
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm3, %xmm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %ymm9, %ymm17
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm8, %ymm9
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm30, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm30, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm3, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm6, %ymm13
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm2, %ymm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm1, %xmm13
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm0, %xmm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm13
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm7, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm8, %ymm8
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm30, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm30, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm17, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm16 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm1, %xmm13
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm4, %zmm10
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512BW-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm8
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm3, %ymm12
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [9,11,10,11,9,11,13,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm5, %zmm15, %zmm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm13
+; AVX512BW-FCP-NEXT:    vpermd %zmm4, %zmm15, %zmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm7, %zmm4, %zmm7
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,5,7,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm10, %ymm4, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm7, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm2, %zmm11
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm18, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm14, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm10, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm5, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm14, %ymm0
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm12
-; AVX512BW-FCP-NEXT:    vpermd %ymm16, %ymm5, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm31, %ymm11, %ymm14
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm13, %ymm9
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm12, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm8, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm11, %ymm14
-; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm13, %ymm7
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm12, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm8, %ymm8
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm11, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm17, %ymm13, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm7, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm10, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm10, %xmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %ymm18, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %ymm17, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, (%r9)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
@@ -7526,291 +7576,296 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 240(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %xmm6
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm9, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512DQ-BW-NEXT:    vpmovqb %ymm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm1, %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm7
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm11
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm16, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm7
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm7, %xmm9
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5],ymm4[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm7
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm16, %xmm5
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %xmm12
-; AVX512DQ-BW-NEXT:    vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa 144(%rdi), %xmm9
+; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqa 176(%rdi), %xmm11
+; AVX512DQ-BW-NEXT:    vmovdqa 240(%rdi), %xmm12
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm4, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa 112(%rdi), %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm16, %zmm13
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm17
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm22
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm9, %xmm17
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm23
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm17
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm17[0],xmm13[0],xmm17[1],xmm13[1],xmm17[2],xmm13[2],xmm17[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm24
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm17
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm0, %ymm25
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm2, %xmm17
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm15
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm9, %xmm18
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm5
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm4, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm12, %xmm8
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm11, %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm13, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm19, %xmm9
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm19, %xmm6
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
 ; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm16, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm20, (%rsi)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm21, (%rdx)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %ymm22, (%rcx)
@@ -7824,167 +7879,172 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm30
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm12, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm16, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm4, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm31, %ymm3, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm5, %zmm3, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm4, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm10, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm7, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm3, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm18, %ymm6, %ymm11
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm2, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm0, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm4, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %ymm6, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm3, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %ymm9, %ymm17
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm8, %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm30, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm30, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm3, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm6, %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm2, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm1, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm0, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm4, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm7, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm8, %ymm8
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm30, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm30, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm17, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm16 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm1, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm4, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm3, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm5, %zmm15, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm4, %zmm15, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm7, %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm10, %ymm4, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm7, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm2, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm18, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm10, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm5, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm14, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm16, %ymm5, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm31, %ymm11, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm13, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm12, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm8, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm11, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm13, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm12, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm8, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm11, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm17, %ymm13, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm7, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm10, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm10, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %ymm18, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %ymm17, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
@@ -13041,729 +13101,731 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i8_stride8_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $520, %rsp # imm = 0x208
-; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm17
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm2
-; AVX512-NEXT:    vmovdqa 496(%rdi), %xmm7
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
-; AVX512-NEXT:    vmovdqa 480(%rdi), %xmm8
-; AVX512-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm11
+; AVX512-NEXT:    subq $408, %rsp # imm = 0x198
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm30
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm4
+; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm10
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
-; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm15
-; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX512-NEXT:    vpmovqb %ymm4, %xmm4
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vmovdqa 368(%rdi), %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm18
-; AVX512-NEXT:    vmovdqa 352(%rdi), %xmm14
-; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa 336(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
-; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm6
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm19
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512-NEXT:    vpmovqb %zmm17, %xmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
+; AVX512-NEXT:    vmovdqa 448(%rdi), %xmm11
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm6
+; AVX512-NEXT:    vmovdqa 480(%rdi), %xmm8
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm19
+; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT:    vmovdqa 416(%rdi), %ymm7
+; AVX512-NEXT:    vpmovqb %ymm7, %xmm8
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT:    vmovdqa 336(%rdi), %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm6
+; AVX512-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm17
+; AVX512-NEXT:    vmovdqa 320(%rdi), %xmm8
+; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm7
+; AVX512-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm22
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512-NEXT:    vmovdqa 352(%rdi), %xmm13
+; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm7
+; AVX512-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512-NEXT:    vpmovqb %zmm30, %xmm6
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512-NEXT:    movb $-64, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm25
-; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm29
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm10
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm5
-; AVX512-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm15
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
+; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm6
+; AVX512-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm12
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm6
+; AVX512-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm5
 ; AVX512-NEXT:    vpmovqb %ymm5, %xmm5
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512-NEXT:    vpmovqb %ymm6, %xmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm30
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm28
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm21
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm16
 ; AVX512-NEXT:    vpmovqb %zmm16, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm24
-; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm21
+; AVX512-NEXT:    vmovdqa 496(%rdi), %xmm1
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm25
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm9
+; AVX512-NEXT:    vpshufb %xmm8, %xmm9, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm11, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm27
-; AVX512-NEXT:    vpshufb %xmm8, %xmm15, %xmm4
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm27
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
+; AVX512-NEXT:    vmovdqa %xmm11, %xmm10
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 432(%rdi), %xmm11
+; AVX512-NEXT:    vmovdqa 416(%rdi), %xmm14
+; AVX512-NEXT:    vmovdqa 432(%rdi), %xmm5
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm22
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm5
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512-NEXT:    vmovdqa 384(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 400(%rdi), %xmm12
+; AVX512-NEXT:    vmovdqa 400(%rdi), %xmm11
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm6
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm6
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm23
-; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm26
+; AVX512-NEXT:    vmovdqa 368(%rdi), %xmm0
+; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512-NEXT:    vpshufb %xmm8, %xmm13, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm28
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm14
-; AVX512-NEXT:    vpshufb %xmm8, %xmm14, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm0
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-NEXT:    vpsrlq $8, %zmm17, %zmm6
+; AVX512-NEXT:    vpsrlq $8, %zmm30, %zmm6
 ; AVX512-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm19
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm0
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm4
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm5
+; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm13
+; AVX512-NEXT:    vpshufb %xmm8, %xmm13, %xmm4
+; AVX512-NEXT:    vpshufb %xmm8, %xmm12, %xmm5
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm5
-; AVX512-NEXT:    vpshufb %xmm8, %xmm10, %xmm6
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm31
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm6
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm12
 ; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm25
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm5
 ; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm29
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm3
-; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm7
+; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm6
+; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
 ; AVX512-NEXT:    vpsrlq $8, %zmm16, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm22
 ; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm4, %xmm13, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm10
-; AVX512-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm1
+; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
-; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm18
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm9
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm19
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm24
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm11
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm10
+; AVX512-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
+; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm15
+; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm30
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm17
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm11
+; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm14
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm12
-; AVX512-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm4, %xmm12, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm15
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm21
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $16, %zmm17, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm11
+; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm14
+; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm6
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
+; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm23
+; AVX512-NEXT:    vpsrlq $16, %zmm30, %zmm14
+; AVX512-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
+; AVX512-NEXT:    vpshufb %xmm4, %xmm13, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm30
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm4, %xmm13, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm13
+; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm14
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
+; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm16
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm9
-; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm24
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm12
+; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm27
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm29
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm28, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm21
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512-NEXT:    vpsrlq $16, %zmm16, %zmm2
+; AVX512-NEXT:    vpsrlq $16, %zmm22, %zmm2
 ; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm2
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm18
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm10
-; AVX512-NEXT:    vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm4
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm9
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm28
+; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm5
+; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm9
-; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm12
+; AVX512-NEXT:    vmovdqa64 %xmm26, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm5
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm8
+; AVX512-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm10
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm14
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $24, %zmm17, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT:    vpsrlq $24, %zmm23, %zmm14
+; AVX512-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm22
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm14
-; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm7
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $24, %zmm16, %zmm1
+; AVX512-NEXT:    vpsrlq $24, %zmm22, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512-NEXT:    vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm28
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm11
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm7
-; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm8
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm9
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm8
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $32, %zmm17, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm8
+; AVX512-NEXT:    vmovdqa %xmm10, %xmm5
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT:    vpsrlq $32, %zmm23, %zmm14
+; AVX512-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm13
+; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm20
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm13
+; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm17
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $32, %zmm16, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm18
+; AVX512-NEXT:    vpsrlq $32, %zmm22, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm3
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm27
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm31
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm10
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm9
-; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm11
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm29
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm18
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $40, %zmm17, %zmm9
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm21
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT:    vpsrlq $40, %zmm23, %zmm14
+; AVX512-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm30
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm13
-; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm11
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm30
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm9
-; AVX512-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm12
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm3, %xmm15, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm28
-; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm16
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm24
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm10
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm7
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $40, %zmm18, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm26
+; AVX512-NEXT:    vpsrlq $40, %zmm22, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm27
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm29
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm19
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm8
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm18
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm15, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
-; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm22
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $48, %zmm17, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm14
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm14
+; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm15
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT:    vpsrlq $48, %zmm23, %zmm14
+; AVX512-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm13, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm19
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm17
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm25
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm11
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm6
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm9
+; AVX512-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm30
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm20
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm14
-; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm13
-; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm5
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm15
-; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm20
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm16
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm9
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $48, %zmm26, %zmm1
+; AVX512-NEXT:    vpsrlq $48, %zmm22, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm24
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm17
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm1
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm8
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm8
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm9
-; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm13
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $56, %zmm23, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm7
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
+; AVX512-NEXT:    vpsrlq $56, %zmm23, %zmm12
+; AVX512-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm11
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm7
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm5
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm7
+; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
+; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm3
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $56, %zmm26, %zmm1
+; AVX512-NEXT:    vpsrlq $56, %zmm22, %zmm1
 ; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -13779,1168 +13841,1179 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-NEXT:    vmovaps %zmm1, (%r9)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vmovaps %zmm1, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm27, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512-NEXT:    addq $520, %rsp # imm = 0x208
+; AVX512-NEXT:    addq $408, %rsp # imm = 0x198
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: load_i8_stride8_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $232, %rsp
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm29
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm30
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm22
+; AVX512-FCP-NEXT:    vpermd %ymm22, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa 416(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm19
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpmovqd %ymm2, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm23
+; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm0, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm29
+; AVX512-FCP-NEXT:    vpmovqd %ymm29, %xmm5
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
-; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpmovqd %ymm3, %xmm12
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT:    vpmovqb %zmm29, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [8,10,12,14,12,14,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm20, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
+; AVX512-FCP-NEXT:    vpmovqb %zmm28, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    movb $-64, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm31
-; AVX512-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm25
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
+; AVX512-FCP-NEXT:    vpermd %ymm24, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT:    vpermd %zmm13, %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm14
-; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm14
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm28
-; AVX512-FCP-NEXT:    vpmovqd %ymm27, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm15
-; AVX512-FCP-NEXT:    vpmovqd %ymm28, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm30
-; AVX512-FCP-NEXT:    vpmovqb %zmm30, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm7
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm24
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512-FCP-NEXT:    vpmovqd %ymm25, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm10
+; AVX512-FCP-NEXT:    vpermd %zmm27, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm20
+; AVX512-FCP-NEXT:    vpmovqb %zmm20, %xmm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm12, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm6
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm19
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm22
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT:    vpsrlq $8, %zmm29, %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT:    vpsrlq $8, %zmm28, %zmm12
+; AVX512-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm17
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT:    vpsrlq $8, %zmm20, %zmm14
 ; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm19
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm17
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm14
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT:    vpsrlq $8, %zmm30, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm18
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm12
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm29, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm10
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa %xmm14, %xmm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm28, %zmm12
+; AVX512-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT:    vpsrlq $16, %zmm20, %zmm14
 ; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT:    vpsrlq $16, %zmm30, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm21
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm7
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm18
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm6
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm7
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm8
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT:    vpsrlq $24, %zmm29, %zmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm7
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT:    vpsrlq $24, %zmm28, %zmm8
 ; AVX512-FCP-NEXT:    vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT:    vpsrlq $24, %zmm30, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT:    vpsrlq $24, %zmm20, %zmm1
 ; AVX512-FCP-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm18
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm16
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT:    vpermd %ymm22, %ymm6, %ymm15
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm22
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm2
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm2
+; AVX512-FCP-NEXT:    vpermd %zmm31, %zmm1, %zmm4
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm6, %ymm4
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512-FCP-NEXT:    vpermd %zmm30, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpermd %ymm29, %ymm23, %ymm10
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm29, %zmm3
-; AVX512-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm17
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm22
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm19
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm19 {%k1}
+; AVX512-FCP-NEXT:    vpermd %ymm24, %ymm6, %ymm4
 ; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm27, %ymm16, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
-; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm16, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX512-FCP-NEXT:    vpsrlq $32, %zmm30, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm16
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm5
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT:    vpermd %ymm26, %ymm6, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpermd %zmm27, %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm23, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT:    vpsrlq $32, %zmm20, %zmm14
+; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm6
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm25
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT:    vpsrlq $40, %zmm29, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vpsrlq $40, %zmm28, %zmm14
+; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT:    vpsrlq $40, %zmm30, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm19
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm24
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm29, %zmm14
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT:    vpsrlq $40, %zmm20, %zmm14
 ; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm14
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm21
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm15, %ymm25
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm6
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm28, %zmm12
+; AVX512-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm12
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT:    vpsrlq $48, %zmm30, %zmm15
-; AVX512-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT:    vpsrlq $48, %zmm20, %zmm14
+; AVX512-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm17
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm6
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm10
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm9
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm10
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm9
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm8
 ; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT:    vpsrlq $56, %zmm29, %zmm8
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512-FCP-NEXT:    vpsrlq $56, %zmm28, %zmm8
 ; AVX512-FCP-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT:    vpsrlq $56, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT:    vpsrlq $56, %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%r9)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-FCP-NEXT:    addq $232, %rsp
+; AVX512-FCP-NEXT:    addq $584, %rsp # imm = 0x248
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i8_stride8_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $520, %rsp # imm = 0x208
-; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm2
-; AVX512DQ-NEXT:    vmovdqa 496(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm11
+; AVX512DQ-NEXT:    subq $408, %rsp # imm = 0x198
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm30
+; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm10
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
-; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm15
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpmovqb %ymm4, %xmm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa 368(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm18
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa 336(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm19
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512DQ-NEXT:    vpmovqb %zmm17, %xmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vmovdqa 480(%rdi), %xmm8
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm19
+; AVX512DQ-NEXT:    vpmovqb %zmm5, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %ymm7
+; AVX512DQ-NEXT:    vpmovqb %ymm7, %xmm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT:    vmovdqa 336(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm17
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm7
+; AVX512DQ-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm22
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm13, %xmm7
+; AVX512DQ-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512DQ-NEXT:    vpmovqb %zmm30, %xmm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512DQ-NEXT:    movb $-64, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm25
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm15
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
+; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpmovqb %zmm3, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm5
 ; AVX512DQ-NEXT:    vpmovqb %ymm5, %xmm5
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpmovqb %ymm6, %xmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT:    vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm30
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm28
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm21
+; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm16
 ; AVX512DQ-NEXT:    vpmovqb %zmm16, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm24
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm21
+; AVX512DQ-NEXT:    vmovdqa 496(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm9, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm11, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm27
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm15, %xmm4
+; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm27
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm11, %xmm10
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 432(%rdi), %xmm11
+; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 432(%rdi), %xmm5
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512DQ-NEXT:    vmovdqa 384(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 400(%rdi), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 400(%rdi), %xmm11
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm6
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm23
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm14, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm26
+; AVX512DQ-NEXT:    vmovdqa 368(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm13, %xmm6
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm28
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm14, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm6
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-NEXT:    vpsrlq $8, %zmm17, %zmm6
+; AVX512DQ-NEXT:    vpsrlq $8, %zmm30, %zmm6
 ; AVX512DQ-NEXT:    vpmovqb %zmm6, %xmm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm19
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm4
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm5
+; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm13, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm12, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm10, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm31
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm6
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm25
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm29
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm3
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
 ; AVX512DQ-NEXT:    vpsrlq $8, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm22
 ; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm5, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm18
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm19
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm10
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm15
+; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm17
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm14
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm12
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm12, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm14, %xmm15
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm21
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm17, %zmm9
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm23
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm30, %zmm14
+; AVX512DQ-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm30
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm16
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm24
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm27
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm21
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm22, %zmm2
 ; AVX512DQ-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm18
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm6
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm28
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm12
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $24, %zmm17, %zmm9
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vpsrlq $24, %zmm23, %zmm14
+; AVX512DQ-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm22
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm7
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $24, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vpsrlq $24, %zmm22, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm1
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm28
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm8
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm17, %zmm9
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm23, %zmm14
+; AVX512DQ-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm12, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm20
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm17
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm16, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm18
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm22, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm27
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm31
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm11
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm29
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm18
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $40, %zmm17, %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm21
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vpsrlq $40, %zmm23, %zmm14
+; AVX512DQ-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm13
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm30
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm12, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm12
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm15, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm28
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm14, %xmm16
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $40, %zmm18, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm26
+; AVX512DQ-NEXT:    vpsrlq $40, %zmm22, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm27
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm29
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm19
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm21
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm8
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm18
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm15
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm15, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm22
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm17, %zmm9
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm15
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm23, %zmm14
+; AVX512DQ-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm13, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm19
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm17
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm9
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm30
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm20
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm13
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm15
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm20
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm16
-; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm26, %zmm1
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm22, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm24
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm17
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm1
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm13
+; AVX512DQ-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT:    vpsrlq $56, %zmm23, %zmm9
-; AVX512DQ-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm12
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
+; AVX512DQ-NEXT:    vpsrlq $56, %zmm23, %zmm12
+; AVX512DQ-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm11
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT:    vpsrlq $56, %zmm26, %zmm1
+; AVX512DQ-NEXT:    vpsrlq $56, %zmm22, %zmm1
 ; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -14956,1095 +15029,1107 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm1, (%r9)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovaps %zmm1, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-NEXT:    addq $520, %rsp # imm = 0x208
+; AVX512DQ-NEXT:    addq $408, %rsp # imm = 0x198
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $232, %rsp
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm29
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm19
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm19
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm29
+; AVX512DQ-FCP-NEXT:    vpmovqd %ymm29, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm3, %xmm12
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm29, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm20, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm28, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm24
+; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT:    vpermd %zmm13, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm14
-; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT:    vpermd %zmm8, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm14
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm27, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm15
-; AVX512DQ-FCP-NEXT:    vpmovqd %ymm28, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm30
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm30, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm24
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT:    vpmovqd %ymm25, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm10
+; AVX512DQ-FCP-NEXT:    vpermd %zmm27, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm20
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm20, %xmm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm19
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm22
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm20, %zmm14
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm16
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm17
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $8, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm18
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm12
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm14, %xmm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm20, %zmm14
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $16, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm21
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm18
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm7
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm5 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm28, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm30, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $24, %zmm20, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm18
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm6, %ymm15
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm22
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %zmm31, %zmm1, %zmm4
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermd %ymm29, %ymm23, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm17
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm22
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermd %ymm24, %ymm6, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
-; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm27, %ymm16, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm16
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm26, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm27, %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm23, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $32, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm6
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm25
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm29, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm28, %zmm14
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm19
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm26
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm24
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $40, %zmm20, %zmm14
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm14
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm21
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm15, %ymm25
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm6
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $48, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm17
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm6
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm9
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm28, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpmovqb %zmm8, %xmm8
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpsrlq $56, %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%r9)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-FCP-NEXT:    addq $232, %rsp
+; AVX512DQ-FCP-NEXT:    addq $584, %rsp # imm = 0x248
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i8_stride8_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $744, %rsp # imm = 0x2E8
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa64 496(%rdi), %xmm24
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm3
-; AVX512BW-NEXT:    vmovdqa64 480(%rdi), %xmm25
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm25, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vmovdqa64 464(%rdi), %xmm26
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm26, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm30, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX512BW-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512BW-NEXT:    subq $760, %rsp # imm = 0x2F8
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm2, %xmm5
+; AVX512BW-NEXT:    vmovdqa 448(%rdi), %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm1, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqa 480(%rdi), %xmm10
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm7
+; AVX512BW-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-NEXT:    vmovdqa64 368(%rdi), %xmm31
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm31, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512BW-NEXT:    vmovdqa64 336(%rdi), %xmm22
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm22, %xmm11
-; AVX512BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 320(%rdi), %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm9, %xmm15
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm20
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm5
+; AVX512BW-NEXT:    vpmovqb %ymm5, %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqa 416(%rdi), %ymm7
+; AVX512BW-NEXT:    vpmovqb %ymm7, %xmm8
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512BW-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT:    vmovdqa64 336(%rdi), %xmm31
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm31, %xmm8
+; AVX512BW-NEXT:    vmovdqa64 %xmm31, (%rsp) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %xmm23
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm23, %xmm11
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512BW-NEXT:    vmovdqa 352(%rdi), %xmm4
+; AVX512BW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm4, %xmm13
+; AVX512BW-NEXT:    vpmovqb %zmm6, %xmm6
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3]
+; AVX512BW-NEXT:    vpmovqb %zmm29, %xmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm16
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 240(%rdi), %xmm28
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm28, %xmm7
-; AVX512BW-NEXT:    vmovdqa64 224(%rdi), %xmm18
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm10
-; AVX512BW-NEXT:    vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm17, %xmm10
-; AVX512BW-NEXT:    vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm8, %xmm16
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm11
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vmovdqa64 224(%rdi), %xmm30
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm30, %xmm17
+; AVX512BW-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm17[0],xmm9[0],xmm17[1],xmm9[1],xmm17[2],xmm9[2],xmm17[3],xmm9[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT:    vpmovqb %ymm11, %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vmovdqa64 160(%rdi), %ymm17
+; AVX512BW-NEXT:    vpmovqb %ymm17, %xmm18
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,3,0,1]
+; AVX512BW-NEXT:    vpmovqb %ymm17, %xmm17
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm17
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm14
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
+; AVX512BW-NEXT:    vmovdqa64 96(%rdi), %xmm20
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm20, %xmm15
+; AVX512BW-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpmovqb %zmm19, %xmm17
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3]
+; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm8
 ; AVX512BW-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm10
-; AVX512BW-NEXT:    vpmovqb %ymm10, %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm10, %xmm16
-; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm14
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
-; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 240(%rdi), %xmm9
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %xmm17
+; AVX512BW-NEXT:    vmovdqa64 400(%rdi), %xmm26
+; AVX512BW-NEXT:    vmovdqa 416(%rdi), %xmm6
+; AVX512BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 432(%rdi), %xmm19
+; AVX512BW-NEXT:    vmovdqa64 496(%rdi), %xmm21
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm12
+; AVX512BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm22
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm22[0],xmm12[0],xmm22[1],xmm12[1],xmm22[2],xmm12[2],xmm22[3],xmm12[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm28 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vmovdqa64 %xmm2, %xmm16
 ; AVX512BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm21
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm2, %xmm22
+; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm1, %xmm24
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm25 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm25, %xmm19, %xmm22
+; AVX512BW-NEXT:    vpshufb %xmm25, %xmm6, %xmm24
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm27, %xmm26, %xmm22
+; AVX512BW-NEXT:    vpshufb %xmm27, %xmm17, %xmm24
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX512BW-NEXT:    vmovdqa 368(%rdi), %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm12
+; AVX512BW-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm31, %xmm15
+; AVX512BW-NEXT:    vmovdqa64 %xmm23, %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm23, %xmm22
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512BW-NEXT:    vpsrlq $8, %zmm29, %zmm15
+; AVX512BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm18, %xmm15
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm23, %xmm22
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm2
 ; AVX512BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpmovqb %zmm5, %xmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX512BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 384(%rdi), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 400(%rdi), %xmm21
-; AVX512BW-NEXT:    vmovdqa64 416(%rdi), %xmm23
-; AVX512BW-NEXT:    vmovdqa64 432(%rdi), %xmm29
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vmovdqa64 %xmm24, %xmm19
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm24, %xmm1
-; AVX512BW-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vmovdqa64 %xmm26, %xmm12
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm26, %xmm24
-; AVX512BW-NEXT:    vmovdqa64 %xmm30, %xmm16
-; AVX512BW-NEXT:    vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm30, %xmm25
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm29, %xmm24
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm23, %xmm25
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm21, %xmm24
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm7, %xmm25
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpshufb %xmm25, %xmm2, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm25, %xmm8, %xmm25
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm25[0],xmm15[0],xmm25[1],xmm15[1],xmm25[2],xmm15[2],xmm25[3],xmm15[3]
+; AVX512BW-NEXT:    vmovdqa64 144(%rdi), %xmm22
+; AVX512BW-NEXT:    vpshufb %xmm27, %xmm22, %xmm31
+; AVX512BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm27, %xmm5, %xmm27
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm27 = xmm27[0],xmm31[0],xmm27[1],xmm31[1],xmm27[2],xmm31[2],xmm27[3],xmm31[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm31, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm22, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm9, %xmm24
-; AVX512BW-NEXT:    vmovdqa64 %xmm9, %xmm31
-; AVX512BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-NEXT:    vpsrlq $8, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm22
-; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %xmm28, %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm28, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm18, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm17, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm8, %xmm24
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm0, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm6, %xmm25
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm6
-; AVX512BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 144(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm6, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm10, %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm15, %xmm3
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm28, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT:    vpsrlq $8, %zmm5, %zmm3
-; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm19, %xmm2
-; AVX512BW-NEXT:    vmovdqa64 %xmm19, %xmm20
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm12, %xmm25
-; AVX512BW-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vmovdqa64 %xmm29, %xmm24
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm29, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm23, %xmm30
-; AVX512BW-NEXT:    vmovdqa64 %xmm23, %xmm16
-; AVX512BW-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    vmovdqa64 %xmm21, %xmm18
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm21, %xmm0
-; AVX512BW-NEXT:    vmovdqa64 %xmm7, %xmm17
-; AVX512BW-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm7, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vmovdqa64 112(%rdi), %xmm27
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm2
+; AVX512BW-NEXT:    vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm13, %xmm2
+; AVX512BW-NEXT:    vmovdqa64 %xmm13, %xmm24
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm28, %xmm14, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $8, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm28
+; AVX512BW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm28
+; AVX512BW-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm20, %xmm31
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm31 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm31, %xmm26, %xmm0
+; AVX512BW-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %xmm17, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm31, %xmm17, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm12
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm27, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm31, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpsrlq $16, %zmm22, %zmm13
-; AVX512BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
-; AVX512BW-NEXT:    vmovdqa64 %xmm9, %xmm26
-; AVX512BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm21, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm11, %xmm12
+; AVX512BW-NEXT:    vmovdqa %xmm11, %xmm13
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rsp), %xmm17 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm12
+; AVX512BW-NEXT:    vmovdqa %xmm6, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm7
+; AVX512BW-NEXT:    vpsrlq $16, %zmm29, %zmm12
+; AVX512BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm0
+; AVX512BW-NEXT:    vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm30, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512BW-NEXT:    vmovdqa64 %xmm18, %xmm16
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm18, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 %xmm23, %xmm29
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm23, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm13
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm31, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm30, %xmm23, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm28, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT:    vpsrlq $16, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm25, %xmm12
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm6, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT:    vpshufb %xmm31, %xmm22, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm31, %xmm5, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm3
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT:    vmovdqa64 %xmm14, %xmm27
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512BW-NEXT:    vpsrlq $16, %zmm8, %zmm3
 ; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm1
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm25, %xmm3
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm24, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm30
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm17, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm19, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm20, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm26, %xmm31
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm28
+; AVX512BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm28[0],xmm31[0],xmm28[1],xmm31[1],xmm28[2],xmm31[2],xmm28[3],xmm31[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa %xmm12, %xmm10
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm13
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
-; AVX512BW-NEXT:    vpsrlq $24, %zmm6, %zmm13
-; AVX512BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 %xmm13, %xmm26
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm11, %xmm28
+; AVX512BW-NEXT:    vmovdqa %xmm11, %xmm14
+; AVX512BW-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-NEXT:    vpsrlq $24, %zmm7, %zmm12
+; AVX512BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 %xmm22, %xmm21
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm30
-; AVX512BW-NEXT:    vmovdqa64 %xmm29, %xmm22
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512BW-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm31, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm23, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 %xmm25, %xmm29
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX512BW-NEXT:    vmovdqa %xmm6, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX512BW-NEXT:    vmovdqa64 %xmm14, %xmm23
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm22, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
+; AVX512BW-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm2
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm17
-; AVX512BW-NEXT:    vpsrlq $24, %zmm28, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpsrlq $24, %zmm7, %zmm2
 ; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
-; AVX512BW-NEXT:    vmovdqa64 %xmm11, %xmm28
-; AVX512BW-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm3
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm20, %xmm4
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm4
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm28
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm10, %xmm31
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm24, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm26, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm30
-; AVX512BW-NEXT:    vmovdqa64 %xmm18, %xmm24
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm11, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm13, %xmm28
+; AVX512BW-NEXT:    vmovdqa %xmm13, %xmm14
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm7, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vmovdqa64 %xmm27, %xmm24
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm3
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm20, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa %xmm10, %xmm6
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm2
-; AVX512BW-NEXT:    vmovdqa64 %xmm27, %xmm7
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm13
-; AVX512BW-NEXT:    vmovdqa64 %xmm8, %xmm27
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpsrlq $32, %zmm10, %zmm13
-; AVX512BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vmovdqa64 %xmm25, %xmm23
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm22, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm21, %xmm28
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm29, %xmm31
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rsp), %xmm26 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm12
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpsrlq $40, %zmm10, %zmm12
+; AVX512BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm14, %xmm28
+; AVX512BW-NEXT:    vmovdqa64 %xmm14, %xmm31
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm31, %xmm13
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm8, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm12
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm14, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm20
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpsrlq $32, %zmm17, %zmm3
+; AVX512BW-NEXT:    vpsrlq $40, %zmm17, %zmm3
 ; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm28, %xmm2
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm24, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm4
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm24, %xmm19
-; AVX512BW-NEXT:    vmovdqa64 %xmm24, %xmm26
-; AVX512BW-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm18, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm23, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm29, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-NEXT:    vpsrlq $40, %zmm10, %zmm5
-; AVX512BW-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm26, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm25, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512BW-NEXT:    vpsrlq $48, %zmm5, %zmm15
+; AVX512BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
-; AVX512BW-NEXT:    vmovdqa64 %xmm11, %xmm17
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm15
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm31, %xmm28
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm31, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm8, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512BW-NEXT:    vmovdqa64 %xmm10, %xmm31
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm7, %xmm3
+; AVX512BW-NEXT:    vmovdqa64 %xmm7, %xmm28
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vpsrlq $40, %zmm20, %zmm2
+; AVX512BW-NEXT:    vpsrlq $48, %zmm17, %zmm2
 ; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm28, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm26, %xmm19
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm18, %xmm30
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm24, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm5
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm30, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX512BW-NEXT:    vpsrlq $48, %zmm10, %zmm5
-; AVX512BW-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm19
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm9, %xmm18
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm31, %xmm5
-; AVX512BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm13, %xmm20, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512BW-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpsrlq $48, %zmm19, %zmm2
-; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm28, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm8, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm24, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm18, %xmm3
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm5
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm10
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm8
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm24, %xmm5
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm30, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512BW-NEXT:    vpsrlq $56, %zmm10, %zmm6
-; AVX512BW-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm23, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm22, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm29, %xmm18
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm5, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm30, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm5
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512BW-NEXT:    vpsrlq $56, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm5
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm14, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm17, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm6
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm9
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm20, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm6, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm6
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm6
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm18, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm31, %xmm4
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm31, %xmm6
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm20, %xmm8
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512BW-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm23, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm28, %xmm4
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT:    vpsrlq $56, %zmm19, %zmm3
+; AVX512BW-NEXT:    vpsrlq $56, %zmm17, %zmm3
 ; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -16066,343 +16151,343 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT:    addq $744, %rsp # imm = 0x2E8
+; AVX512BW-NEXT:    addq $760, %rsp # imm = 0x2F8
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: load_i8_stride8_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    subq $360, %rsp # imm = 0x168
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm23
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm30
-; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm30, %ymm1
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm22
-; AVX512BW-FCP-NEXT:    vpermd %ymm22, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm20, %ymm2
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm27
-; AVX512BW-FCP-NEXT:    vpermd %ymm27, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm25
+; AVX512BW-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm20, %ymm1
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [8,10,10,11,8,10,12,14]
+; AVX512BW-FCP-NEXT:    vpermd %zmm23, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm29
+; AVX512BW-FCP-NEXT:    vpermd %ymm29, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm17, %ymm2
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpermd %zmm16, %zmm9, %zmm18
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm24
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm24, %xmm18
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm23
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm23, %xmm17
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm17, %xmm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm26, %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm22
+; AVX512BW-FCP-NEXT:    vpmovqd %ymm22, %xmm7
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm5, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm24, %xmm3
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm19
 ; AVX512BW-FCP-NEXT:    movb $-64, %al
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm30
+; AVX512BW-FCP-NEXT:    vpermd %ymm30, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm1
+; AVX512BW-FCP-NEXT:    vpermd %zmm14, %zmm9, %zmm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm11, %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512BW-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm5, %ymm0
+; AVX512BW-FCP-NEXT:    vpermd %zmm13, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm9, %ymm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm21
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm31
-; AVX512BW-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm21
-; AVX512BW-FCP-NEXT:    vpermd %ymm21, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm0
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm28
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm28, %xmm5
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-FCP-NEXT:    vpmovqd %ymm25, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm8, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT:    vpermd %zmm21, %zmm10, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm10[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm10, %xmm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm31, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm17, %ymm13
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm18, %xmm13
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm26, %zmm15
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm27 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm28 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm6, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm24, %zmm15
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm4, %ymm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm10, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm31, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm1, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $8, %zmm10, %zmm15
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm17, %ymm13
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm18, %ymm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm18, %xmm13
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm26, %zmm15
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm28 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm31 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm6, %xmm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm24, %zmm14
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm14
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm5, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm15
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm1, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $16, %zmm10, %zmm15
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm20, %ymm12
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm18, %xmm12
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm17, %xmm6
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm26, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm8, %ymm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm13
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm18 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm7, %xmm7
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm24, %zmm7
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm3
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm7, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm5, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm11, %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm8, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm11, %ymm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm5, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm10, %zmm2
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm22, %ymm6, %ymm2
-; AVX512BW-FCP-NEXT:    vpermd %ymm27, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm2, %ymm5
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm3, %ymm11
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm24, %ymm12, %ymm4
-; AVX512BW-FCP-NEXT:    vpermd %ymm23, %ymm12, %ymm5
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm4, %xmm13
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm26, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm13
-; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm17, %ymm15
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm31, %ymm6, %ymm16
-; AVX512BW-FCP-NEXT:    vpermd %ymm21, %ymm6, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm16, %ymm6
-; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm15, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FCP-NEXT:    vpermd %ymm28, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT:    vpermd %ymm25, %ymm12, %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm13, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm18, %xmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm8, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm14, %zmm6
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm25, %ymm6, %ymm0
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [9,11,10,11,9,11,13,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm23, %zmm13, %zmm1
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm1, %ymm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm29, %ymm6, %ymm2
+; AVX512BW-FCP-NEXT:    vpermd %zmm16, %zmm13, %zmm3
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm5
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm3, %ymm7
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm22, %ymm16, %ymm5
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm9
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm5, %xmm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm24, %zmm9
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpermd %ymm30, %ymm6, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm9, %ymm12
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT:    vpermd %ymm26, %ymm6, %ymm12
+; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7]
+; AVX512BW-FCP-NEXT:    vpermd %zmm21, %zmm23, %zmm14
+; AVX512BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm16 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $32, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm22
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm1, %ymm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm2, %ymm12
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm3, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm4, %xmm12
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm26, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm17, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm16, %ymm14
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm15, %ymm9
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm13, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm8, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm21
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm1, %ymm10
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm2, %ymm10
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm4, %xmm10
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm5, %xmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm26, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm12
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm22, %ymm16, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm13, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm18, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm8, %zmm14
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm2, %ymm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm3, %ymm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm4, %xmm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm25 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm5, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm24, %zmm11
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm9, %ymm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $40, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm21
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm1, %ymm8
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm2, %ymm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm3, %ymm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm4, %xmm8
+; AVX512BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm5, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm24, %zmm11
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm9, %ymm11
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vpsrlq $48, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm4, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm5, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm5, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm26, %zmm2
+; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm24, %zmm2
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm17, %ymm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm2
-; AVX512BW-FCP-NEXT:    vpshufb %ymm30, %ymm15, %ymm3
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm7, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm9, %ymm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm12, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm13, %ymm3
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm13, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm18, %xmm3
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm14, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm16, %xmm3
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vpsrlq $56, %zmm10, %zmm3
 ; AVX512BW-FCP-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
@@ -16413,671 +16498,674 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm1, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, (%rax)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512BW-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512BW-FCP-NEXT:    addq $360, %rsp # imm = 0x168
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i8_stride8_vf64:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    subq $744, %rsp # imm = 0x2E8
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 496(%rdi), %xmm24
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 480(%rdi), %xmm25
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm25, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 464(%rdi), %xmm26
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm26, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm30, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-BW-NEXT:    vpmovqb %ymm4, %xmm4
+; AVX512DQ-BW-NEXT:    subq $760, %rsp # imm = 0x2F8
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm29
+; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm2, %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa 448(%rdi), %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm1, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT:    vmovdqa 480(%rdi), %xmm10
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm7
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 368(%rdi), %xmm31
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm31, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 336(%rdi), %xmm22
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm22, %xmm11
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 320(%rdi), %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm1, %xmm11
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm20
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm5
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm5, %xmm5
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT:    vmovdqa 416(%rdi), %ymm7
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm7, %xmm8
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm7, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 336(%rdi), %xmm31
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm31, %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm31, (%rsp) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %xmm23
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm23, %xmm11
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512DQ-BW-NEXT:    vmovdqa 352(%rdi), %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm6, %xmm6
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3]
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm29, %xmm11
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm16
 ; AVX512DQ-BW-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 240(%rdi), %xmm28
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm28, %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 224(%rdi), %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm17, %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm8, %xmm16
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm16 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm11
+; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 224(%rdi), %xmm30
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm30, %xmm17
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm17[0],xmm9[0],xmm17[1],xmm9[1],xmm17[2],xmm9[2],xmm17[3],xmm9[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm11, %xmm11
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 160(%rdi), %ymm17
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm17, %xmm18
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vpmovqb %ymm17, %xmm17
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm13, %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm0, %xmm14
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 96(%rdi), %xmm20
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm20, %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm19, %xmm17
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %xmm8
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm10
-; AVX512DQ-BW-NEXT:    vpmovqb %ymm10, %xmm10
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa 112(%rdi), %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm10, %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm14
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
-; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa 240(%rdi), %xmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa64 400(%rdi), %xmm26
+; AVX512DQ-BW-NEXT:    vmovdqa 416(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 432(%rdi), %xmm19
+; AVX512DQ-BW-NEXT:    vmovdqa64 496(%rdi), %xmm21
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm22
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm22[0],xmm12[0],xmm22[1],xmm12[1],xmm22[2],xmm12[2],xmm22[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm28 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm2, %xmm16
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm2, %xmm22
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm1, %xmm24
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm25 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm19, %xmm22
+; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm6, %xmm24
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm27, %xmm26, %xmm22
+; AVX512DQ-BW-NEXT:    vpshufb %xmm27, %xmm17, %xmm24
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa 368(%rdi), %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm31, %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm23, %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm23, %xmm22
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm29, %zmm15
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm18, %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm23, %xmm22
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vmovdqa 176(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm5, %xmm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 400(%rdi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 416(%rdi), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa64 432(%rdi), %xmm29
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm24, %xmm19
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm24, %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm26, %xmm12
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm26, %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm30, %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm30, %xmm25
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm29, %xmm24
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm23, %xmm25
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm21, %xmm24
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm7, %xmm25
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm8, %xmm25
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm25[0],xmm15[0],xmm25[1],xmm15[1],xmm25[2],xmm15[2],xmm25[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 144(%rdi), %xmm22
+; AVX512DQ-BW-NEXT:    vpshufb %xmm27, %xmm22, %xmm31
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm27, %xmm5, %xmm27
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm27 = xmm27[0],xmm31[0],xmm27[1],xmm31[1],xmm27[2],xmm31[2],xmm27[3],xmm31[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm31, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm22, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm9, %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm9, %xmm31
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm15, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm22
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm28, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm28, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm18, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm17, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm8, %xmm24
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vmovdqa 176(%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 112(%rdi), %xmm27
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm13, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm13, %xmm24
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm28, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm3, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm6, %xmm25
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 144(%rdi), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm6, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm20, %xmm31
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm31 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm31, %xmm26, %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm17, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm31, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm12
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm10, %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm15, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm28, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $8, %zmm5, %zmm3
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm19, %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm19, %xmm20
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm12, %xmm25
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm29, %xmm24
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm29, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm23, %xmm30
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm23, %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm21, %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm21, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm7, %xmm17
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm7, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm11, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm11, %xmm13
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsp), %xmm17 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, %zmm7
+; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm29, %zmm12
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm30, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm18, %xmm16
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm18, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm23, %xmm29
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm23, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm31, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm22, %zmm13
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm9, %xmm26
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm31, %xmm22, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm31, %xmm5, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm12
 ; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm30, %xmm23, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm10, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm28, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm5, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm28
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm14, %xmm27
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $16, %zmm8, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm25, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm4, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm24, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm17, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm20, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm26, %xmm31
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm10, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm28[0],xmm31[0],xmm28[1],xmm31[1],xmm28[2],xmm31[2],xmm28[3],xmm31[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm12, %xmm10
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm6, %zmm13
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm13, %xmm26
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm11, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm11, %xmm14
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm7, %zmm12
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm22, %xmm21
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm30
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm29, %xmm22
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm9, %xmm25
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, %xmm9
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm23, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm25, %xmm29
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm14, %xmm23
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm22, %xmm0
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm2
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm17
-; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm28, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-NEXT:    vpsrlq $24, %zmm7, %zmm2
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm11, %xmm28
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm20, %xmm4
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm28
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm10, %xmm31
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm24, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm26, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm18, %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm18, %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm17, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm11, %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm13, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm13, %xmm14
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm24, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm7, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm27, %xmm24
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm20, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm10, %xmm6
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm27, %xmm7
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm8, %xmm27
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm10, %zmm13
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm25, %xmm23
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm22, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm21, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm29, %xmm31
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm26, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsp), %xmm26 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm10, %zmm12
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm14, %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm14, %xmm31
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm8, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm12
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm14, %xmm12
+; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm17, %zmm3
+; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm28, %xmm2
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm24, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm18, %xmm2
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm4
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm24, %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm24, %xmm26
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm29, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm10, %zmm5
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm30, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm26, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm25, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm5, %zmm15
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm4, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm11, %xmm17
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm20, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm31, %xmm28
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm31, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm10, %xmm31
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm7, %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm7, %xmm28
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $40, %zmm20, %zmm2
+; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm17, %zmm2
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm28, %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm16, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm26, %xmm19
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm24, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm27, %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm30, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm10, %zmm5
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm21, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm22, %xmm19
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm9, %xmm18
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm31, %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm20, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm23, %xmm0
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm29, %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-NEXT:    vpsrlq $48, %zmm19, %zmm2
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm28, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm24, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm18, %xmm3
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm5
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm8, %xmm10
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm15, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
-; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm24, %xmm5
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm27, %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm30, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm10, %zmm6
-; AVX512DQ-BW-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm23, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm21, %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm29, %xmm18
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm5, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm30, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm25, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512DQ-BW-NEXT:    vpsrlq $56, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm5
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm14, %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm17, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm21, %xmm6
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm22, %xmm9
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm20, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm16, %xmm6
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm25, %xmm6
-; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm18, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm27, %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm31, %xmm4
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm31, %xmm6
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm20, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm14, %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm19, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm11, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm23, %xmm1
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm26, %xmm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm28, %xmm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm19, %zmm3
+; AVX512DQ-BW-NEXT:    vpsrlq $56, %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -17099,343 +17187,343 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-BW-NEXT:    addq $744, %rsp # imm = 0x2E8
+; AVX512DQ-BW-NEXT:    addq $760, %rsp # imm = 0x2F8
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    subq $360, %rsp # imm = 0x168
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm30
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm30, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm12, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm22, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm20, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm27
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm27, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm25, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm20, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm23, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm8, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm29
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm29, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm17, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm16, %zmm9, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm24, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm23, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm17, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm26, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm22, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm5, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm24, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm30
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm30, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm14, %zmm9, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm11, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm26, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm5, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm13, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm9, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm31
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm31, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm21
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm21, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm28
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm28, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm25, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm8, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm21, %zmm10, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm10[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm10, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm31, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm17, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm18, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm26, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm27 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm28 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm6, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm24, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm4, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm10, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm31, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm1, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $8, %zmm10, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm17, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm18, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm18, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm26, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm28 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm31 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm6, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm24, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm5, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm15
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm1, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $16, %zmm10, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm20, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm17, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm26, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm8, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm18 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm24, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm0, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm7, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm5, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm11, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm8, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm11, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm5, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm10, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm22, %ymm6, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm27, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm2, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm3, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm24, %ymm12, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm23, %ymm12, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm4, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm26, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm17, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm31, %ymm6, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm21, %ymm6, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm16, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm15, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm28, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm25, %ymm12, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm13, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm8, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm14, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm25, %ymm6, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm23, %zmm13, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm1, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm29, %ymm6, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm16, %zmm13, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm3, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm22, %ymm16, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm5, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm24, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm30, %ymm6, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm9, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm26, %ymm6, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm21, %zmm23, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm16 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $32, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm1, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm2, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm3, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm4, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm26, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm17, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm16, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm15, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm13, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm1, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm2, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm4, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm5, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm26, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm22, %ymm16, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm13, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm18, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm3, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm2, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm3, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm4, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm25 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm5, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm24, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm9, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $40, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm1, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm2, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm3, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm4, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm5, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm24, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm9, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm4, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm5, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm5, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm26, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm24, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm17, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm30, %ymm15, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm7, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm9, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm12, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm13, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm13, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm18, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm14, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm16, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $56, %zmm10, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm3, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
@@ -17446,16 +17534,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512DQ-BW-FCP-NEXT:    addq $360, %rsp # imm = 0x168
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <512 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 6d1ba933b9082..b64513df56946 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1649,301 +1649,313 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride3_vf32:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
 ; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm3
 ; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm5
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-NEXT:    vpermd (%rdx), %zmm4, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-NEXT:    vpor %ymm3, %ymm8, %ymm3
-; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm8
-; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm10
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT:    vprold $16, %xmm0, %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512-NEXT:    vpermd %ymm3, %ymm6, %ymm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512-NEXT:    vpermd %zmm4, %zmm7, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-NEXT:    vpor %ymm5, %ymm9, %ymm5
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm11
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
+; AVX512-NEXT:    vprold $16, %xmm11, %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512-NEXT:    vpermd %zmm4, %zmm8, %zmm8
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512-NEXT:    vpermd %zmm4, %zmm9, %zmm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512-NEXT:    vprold $16, %xmm0, %xmm7
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-NEXT:    vpermd %ymm7, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride3_vf32:
 ; AVX512-FCP:       # %bb.0:
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm5
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-FCP-NEXT:    vpermd (%rdx), %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm10
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vprold $16, %xmm0, %xmm6
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm9, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm11
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
+; AVX512-FCP-NEXT:    vprold $16, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm9, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vprold $16, %xmm0, %xmm7
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride3_vf32:
 ; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm5
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512DQ-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-NEXT:    vpermd (%rdx), %zmm4, %zmm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm8, %ymm3
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm8
-; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm10
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-NEXT:    vprold $16, %xmm0, %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-NEXT:    vpermd %ymm3, %ymm6, %ymm6
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-NEXT:    vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm11
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
+; AVX512DQ-NEXT:    vprold $16, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm9, %zmm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512DQ-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512DQ-NEXT:    vprold $16, %xmm0, %xmm7
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-NEXT:    vpermd %ymm7, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rsi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-FCP-NEXT:    vpermd (%rdx), %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm11
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm0, %xmm7
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -3069,143 +3081,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride3_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm3
+; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm20
+; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm15
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm3
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512-NEXT:    vmovdqa64 32(%rsi), %xmm24
-; AVX512-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa64 16(%rdi), %xmm21
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm14
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512-NEXT:    vpermd %ymm3, %ymm16, %ymm3
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpandn %ymm3, %ymm15, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512-NEXT:    vprold $16, %xmm5, %xmm7
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa64 16(%rdi), %xmm25
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm12
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512-NEXT:    vpermd %ymm5, %ymm19, %ymm11
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpandnq %ymm11, %ymm17, %ymm11
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm10
 ; AVX512-NEXT:    vprold $16, %xmm10, %xmm11
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm12
-; AVX512-NEXT:    vmovdqa 80(%rsi), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512-NEXT:    vmovdqa 80(%rsi), %xmm14
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm13, %ymm0
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-NEXT:    vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512-NEXT:    vpermd %ymm12, %ymm16, %ymm10
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512-NEXT:    vpermd %zmm15, %zmm18, %zmm13
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm5
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm13
-; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512-NEXT:    vmovdqa 112(%rsi), %xmm12
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm13
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
+; AVX512-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm13
+; AVX512-NEXT:    vmovdqa 112(%rsi), %xmm14
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512-NEXT:    vpermd %ymm5, %ymm18, %ymm7
-; AVX512-NEXT:    vpandnq %ymm7, %ymm22, %ymm7
-; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512-NEXT:    vpermd %zmm15, %zmm22, %zmm13
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512-NEXT:    vpermd %zmm15, %zmm23, %zmm15
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm7
-; AVX512-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-NEXT:    vpor %ymm0, %ymm7, %ymm0
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512-NEXT:    vprold $16, %xmm7, %xmm12
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm14, %ymm5
-; AVX512-NEXT:    vpermd %ymm14, %ymm16, %ymm6
-; AVX512-NEXT:    vpandn %ymm6, %ymm15, %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm8
+; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX512-NEXT:    vprold $16, %xmm8, %xmm7
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm7
+; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-NEXT:    vpermd %ymm12, %ymm19, %ymm8
+; AVX512-NEXT:    vpandnq %ymm8, %ymm17, %ymm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512-NEXT:    vpor %ymm0, %ymm6, %ymm0
-; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
+; AVX512-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm8
+; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm9
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
-; AVX512-NEXT:    vprold $16, %xmm7, %xmm7
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-NEXT:    vpermd %ymm8, %ymm18, %ymm6
-; AVX512-NEXT:    vpandnq %ymm6, %ymm22, %ymm6
-; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT:    vpermd %zmm20, %zmm22, %zmm8
+; AVX512-NEXT:    vpermd %zmm20, %zmm23, %zmm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512-NEXT:    vprold $16, %xmm4, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512-NEXT:    vprold $16, %xmm24, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT:    vpermd (%rdx), %zmm18, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpermd %ymm5, %ymm16, %ymm1
+; AVX512-NEXT:    vpermd %zmm20, %zmm18, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 192(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm17, 320(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 128(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm15, 320(%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm10, 256(%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512-NEXT:    vzeroupper
@@ -3213,143 +3233,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-FCP-LABEL: store_i16_stride3_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm15
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
-; AVX512-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm16, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm15, %ymm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512-FCP-NEXT:    vprold $16, %xmm5, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm25
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm19, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpandnq %ymm11, %ymm17, %ymm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm10
 ; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 80(%rsi), %xmm14
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm13, %ymm0
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-FCP-NEXT:    vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm16, %ymm10
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512-FCP-NEXT:    vpermd %zmm15, %zmm18, %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 112(%rdi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 112(%rsi), %xmm14
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm18, %ymm7
-; AVX512-FCP-NEXT:    vpandnq %ymm7, %ymm22, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512-FCP-NEXT:    vpermd %zmm15, %zmm22, %zmm13
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512-FCP-NEXT:    vpermd %zmm15, %zmm23, %zmm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512-FCP-NEXT:    vprold $16, %xmm7, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm5
-; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm16, %ymm6
-; AVX512-FCP-NEXT:    vpandn %ymm6, %ymm15, %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX512-FCP-NEXT:    vprold $16, %xmm8, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm8
+; AVX512-FCP-NEXT:    vpandnq %ymm8, %ymm17, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm9
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vprold $16, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm18, %ymm6
-; AVX512-FCP-NEXT:    vpandnq %ymm6, %ymm22, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm22, %zmm8
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm23, %zmm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vprold $16, %xmm24, %xmm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd (%rdx), %zmm18, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm18, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 320(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 320(%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rcx)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
@@ -3357,143 +3385,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-LABEL: store_i16_stride3_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm20
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm15
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-NEXT:    vmovdqa64 32(%rsi), %xmm24
-; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 16(%rdi), %xmm21
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm14
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-NEXT:    vpermd %ymm3, %ymm16, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpandn %ymm3, %ymm15, %ymm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512DQ-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm7
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT:    vmovdqa64 16(%rdi), %xmm25
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm12
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-NEXT:    vpermd %ymm5, %ymm19, %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpandnq %ymm11, %ymm17, %ymm11
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm10
 ; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm11
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-NEXT:    vmovdqa 80(%rsi), %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa 80(%rsi), %xmm14
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512DQ-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm13, %ymm0
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-NEXT:    vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-NEXT:    vpermd %ymm12, %ymm16, %ymm10
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-NEXT:    vpermd %zmm15, %zmm18, %zmm13
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm5
-; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm13
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa 112(%rsi), %xmm12
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm13
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa 112(%rsi), %xmm14
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-NEXT:    vpermd %ymm5, %ymm18, %ymm7
-; AVX512DQ-NEXT:    vpandnq %ymm7, %ymm22, %ymm7
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512DQ-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-NEXT:    vpermd %zmm15, %zmm22, %zmm13
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm15, %zmm23, %zmm15
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm7
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm7, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-NEXT:    vprold $16, %xmm7, %xmm12
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm14, %ymm5
-; AVX512DQ-NEXT:    vpermd %ymm14, %ymm16, %ymm6
-; AVX512DQ-NEXT:    vpandn %ymm6, %ymm15, %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX512DQ-NEXT:    vprold $16, %xmm8, %xmm7
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm7
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-NEXT:    vpermd %ymm12, %ymm19, %ymm8
+; AVX512DQ-NEXT:    vpandnq %ymm8, %ymm17, %ymm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm6, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm9
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vprold $16, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-NEXT:    vpermd %ymm8, %ymm18, %ymm6
-; AVX512DQ-NEXT:    vpandnq %ymm6, %ymm22, %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm20, %zmm22, %zmm8
+; AVX512DQ-NEXT:    vpermd %zmm20, %zmm23, %zmm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512DQ-NEXT:    vprold $16, %xmm4, %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vprold $16, %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT:    vpermd (%rdx), %zmm18, %zmm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vpermd %ymm5, %ymm16, %ymm1
+; AVX512DQ-NEXT:    vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 192(%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 320(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 128(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 320(%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3501,143 +3537,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm16, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm3, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm19, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm11, %ymm17, %ymm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm10
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rsi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm13, %ymm0
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-FCP-NEXT:    vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm18, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rsi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm7, %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm22, %zmm13
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm23, %zmm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm7, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm16, %ymm6
-; AVX512DQ-FCP-NEXT:    vpandn %ymm6, %ymm15, %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm19, %ymm8
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm8, %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm18, %ymm6
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm6, %ymm22, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm22, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm23, %zmm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm24, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd (%rdx), %zmm18, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 320(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 320(%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 322d606538c54..a68684d948aac 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -349,27 +349,27 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX2-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX2-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX2-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -377,26 +377,26 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FP-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
@@ -404,26 +404,26 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -432,25 +432,25 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    movq (%r8), %rax
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vpbroadcastq %rax, %ymm3
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX512-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX512-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -459,24 +459,24 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    movq (%r8), %rax
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpbroadcastq %rax, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -485,25 +485,25 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    movq (%r8), %rax
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpbroadcastq %rax, %ymm3
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX512DQ-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -512,24 +512,24 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    movq (%r8), %rax
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq %rax, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm0, 32(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 25bad7578c111..4524146afde6b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -404,10 +404,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -438,7 +437,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -471,7 +470,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -501,10 +500,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -538,7 +536,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm5, %ymm3
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -569,10 +567,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -606,7 +603,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm5, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -819,8 +816,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0],xmm10[3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm8
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
@@ -1948,30 +1944,29 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [5,0,14,6,0,15,7,0]
+; AVX512-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,13,2,3,14,5,6,15]
+; AVX512-NEXT:    vpermi2d %ymm6, %ymm8, %ymm7
 ; AVX512-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
-; AVX512-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
+; AVX512-NEXT:    vpermi2d %ymm6, %ymm8, %ymm9
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
-; AVX512-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX512-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm16
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm16
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -2039,44 +2034,44 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm14
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermi2d %zmm14, %zmm12, %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm14
 ; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
-; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
+; AVX512-FCP-NEXT:    vpermi2d %ymm14, %ymm12, %ymm15
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm0
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
-; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,17,2,3,18,5,6,19]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
-; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm11
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm8
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
@@ -2086,20 +2081,20 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [20,1,2,21,4,5,22,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
+; AVX512-FCP-NEXT:    vpermi2d %zmm8, %zmm9, %zmm7
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
 ; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
@@ -2115,30 +2110,29 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [5,0,14,6,0,15,7,0]
+; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,13,2,3,14,5,6,15]
+; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm8, %ymm7
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
-; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
+; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm8, %ymm9
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
-; AVX512DQ-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX512DQ-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm16
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm16
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -2206,44 +2200,44 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm14
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm14, %zmm12, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm14
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm14, %ymm12, %ymm15
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,17,2,3,18,5,6,19]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm11
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
@@ -2253,20 +2247,20 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [20,1,2,21,4,5,22,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm9, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
@@ -4138,213 +4132,215 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-FCP-LABEL: store_i16_stride6_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $40, %rsp
+; AVX512-FCP-NEXT:    subq $72, %rsp
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,2,3,11,11,11,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm9
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm24
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm7, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm27
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm20, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm27
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [2,2,2,3,10,9,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm10 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm28
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm29
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm4, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm5
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm0, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,2,3,10,9,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm28
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,1,0,1,10,10,10,10]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm22
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm19, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm24
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm6
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm7, %zmm21, %zmm6
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpermt2d %ymm7, %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm11
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm20, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm9 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm8
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512-FCP-NEXT:    vpermi2d %ymm5, %ymm9, %ymm19
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm5
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,1,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm19, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm20, %zmm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm20, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm15
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm13, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm20
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm20
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm10
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm2, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm20, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm8
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm3, %ymm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm26, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm7, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm12, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm13, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm15, %zmm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm21, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm3 & (zmm22 ^ zmm10))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm4 ^ ymm1))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm19))
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512-FCP-NEXT:    addq $40, %rsp
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm1 & (zmm11 ^ zmm18))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm24))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 256(%rax)
+; AVX512-FCP-NEXT:    addq $72, %rsp
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -4574,213 +4570,215 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $40, %rsp
+; AVX512DQ-FCP-NEXT:    subq $72, %rsp
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,2,3,11,11,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm24
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm27
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm27
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [2,2,2,3,10,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm10 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm28
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm29
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm10, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm5
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,2,3,10,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,1,0,1,10,10,10,10]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm22
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm19, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm6
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm7, %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm20, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm11, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm8
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm5, %ymm9, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,1,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm19, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm20, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm20, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm20
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm20
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm20, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm26, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm7, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm12, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm15, %zmm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm21, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm3 & (zmm22 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm4 ^ ymm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm19))
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-FCP-NEXT:    addq $40, %rsp
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm1 & (zmm11 ^ zmm18))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm24))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 256(%rax)
+; AVX512DQ-FCP-NEXT:    addq $72, %rsp
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -8749,417 +8747,426 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1320, %rsp # imm = 0x528
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-FCP-NEXT:    subq $1448, %rsp # imm = 0x5A8
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15]
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm21, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm3
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm24, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm24, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [2,2,2,3,10,9,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm22, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm5
 ; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9]
 ; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm5
 ; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm3
 ; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm13
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm12
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm18
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[2,1,3,3,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm2
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
 ; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm29, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm7 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-FCP-NEXT:    vpermi2d %ymm3, %ymm7, %ymm29
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm28, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm29, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm2, %ymm29
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm14[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm26
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm29, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm13 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm24, %ymm1
 ; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm27, %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm25
 ; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,1,0,1,10,10,10,10]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm24, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm12
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
-; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm28, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT:    vpermi2d %ymm9, %ymm12, %ymm5
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm15, %xmm4
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5))
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm2
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm27, %zmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm11
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm26, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm9
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[2],mem[2],ymm9[3],mem[3],ymm9[8],mem[8],ymm9[9],mem[9],ymm9[10],mem[10],ymm9[11],mem[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm2
+; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm9, %ymm24
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm2
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm27, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm9))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm1))
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm8 & (zmm0 ^ zmm29))
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 448(%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm27))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm26))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ zmm25))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 640(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm23))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm22))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm23))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm22))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm21))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
@@ -9168,7 +9175,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 704(%rax)
-; AVX512-FCP-NEXT:    addq $1320, %rsp # imm = 0x528
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm24 ^ ymm2))
+; AVX512-FCP-NEXT:    vmovdqa %ymm6, 96(%rax)
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, 64(%rax)
+; AVX512-FCP-NEXT:    addq $1448, %rsp # imm = 0x5A8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -9658,417 +9668,426 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1320, %rsp # imm = 0x528
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    subq $1448, %rsp # imm = 0x5A8
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [2,2,2,3,10,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm22, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm13
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[2,1,3,3,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
 ; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm29, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm7 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm7, %ymm29
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm29, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm14
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm2, %ymm29
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm14[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,10,10,10,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm29, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm24, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm27, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm25
 ; AVX512DQ-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,1,0,1,10,10,10,10]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm28, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512DQ-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm24, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm28, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm11, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm9, %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm25, %zmm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm15, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm2, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm27, %zmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm11
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm26, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[2],mem[2],ymm9[3],mem[3],ymm9[8],mem[8],ymm9[9],mem[9],ymm9[10],mem[10],ymm9[11],mem[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm9, %ymm24
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm27, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm9))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm1))
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm8 & (zmm0 ^ zmm29))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm27))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm26))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ zmm25))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 640(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm23))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm22))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm23))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm22))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm21))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
@@ -10077,7 +10096,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 704(%rax)
-; AVX512DQ-FCP-NEXT:    addq $1320, %rsp # imm = 0x528
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm24 ^ ymm2))
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, 96(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $1448, %rsp # imm = 0x5A8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 5aa7c055d408e..003dab11a85ba 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -441,48 +441,48 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
-; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7]
-; AVX-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7]
-; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX-NEXT:    vpackusdw %xmm8, %xmm7, %xmm7
-; AVX-NEXT:    vpackusdw %xmm7, %xmm7, %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,1,1,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,1]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6],xmm6[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm8 = mem[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm8[1,1,1,1]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm9[0]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4],xmm5[5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[0,1,0,2,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
+; AVX-NEXT:    vpxor %xmm9, %xmm9, %xmm9
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1,2,3],xmm4[4],xmm9[5,6,7]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX-NEXT:    vpackusdw %xmm9, %xmm4, %xmm4
+; AVX-NEXT:    vpackusdw %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6],xmm7[7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm8[5],xmm0[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5],xmm2[6,7]
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rax)
-; AVX-NEXT:    vmovdqa %xmm5, (%rax)
-; AVX-NEXT:    vmovq %xmm4, 48(%rax)
-; AVX-NEXT:    vmovdqa %xmm3, 32(%rax)
+; AVX-NEXT:    vmovdqa %xmm4, (%rax)
+; AVX-NEXT:    vmovq %xmm6, 48(%rax)
+; AVX-NEXT:    vmovdqa %xmm5, 32(%rax)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: store_i16_stride7_vf4:
@@ -2197,8 +2197,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
@@ -2222,8 +2221,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm12[2],xmm3[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
 ; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
@@ -4283,21 +4281,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX-LABEL: store_i16_stride7_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX-NEXT:    subq $568, %rsp # imm = 0x238
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovdqa 16(%rsi), %xmm12
+; AVX-NEXT:    vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 48(%rsi), %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm12
 ; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rsi), %xmm9
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm10
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm1, %ymm11, %ymm2
+; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm2
 ; AVX-NEXT:    vmovdqa 48(%rdx), %xmm4
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2]
 ; AVX-NEXT:    vmovdqa 48(%rcx), %xmm3
@@ -4307,14 +4306,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT:    vandps %ymm5, %ymm11, %ymm5
+; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
 ; AVX-NEXT:    vorps %ymm2, %ymm5, %ymm6
 ; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm5
 ; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7]
-; AVX-NEXT:    vmovdqa 48(%r9), %xmm13
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vmovdqa 48(%r9), %xmm11
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa 48(%rax), %xmm5
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
@@ -4322,17 +4321,17 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
-; AVX-NEXT:    vpsrld $16, %xmm13, %xmm7
+; AVX-NEXT:    vpsrld $16, %xmm11, %xmm7
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpsrlq $48, %xmm3, %xmm6
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX-NEXT:    vandnps %ymm1, %ymm11, %ymm1
+; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
+; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
@@ -4342,13 +4341,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -4365,21 +4364,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX-NEXT:    vandnps %ymm0, %ymm11, %ymm0
-; AVX-NEXT:    vandps %ymm1, %ymm11, %ymm1
+; AVX-NEXT:    vandnps %ymm0, %ymm10, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm10, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovdqa 16(%r8), %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 16(%r9), %xmm11
-; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX-NEXT:    vmovdqa 16(%r9), %xmm10
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
 ; AVX-NEXT:    vmovdqa 16(%rax), %xmm8
 ; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
@@ -4389,65 +4388,65 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vandnps %ymm1, %ymm6, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 32(%rsi), %xmm11
+; AVX-NEXT:    vmovdqa 32(%rsi), %xmm10
 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX-NEXT:    vpsrld $16, %xmm11, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm10, %xmm0
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqa 32(%rcx), %xmm7
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vmovdqa 32(%rdx), %xmm10
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2]
+; AVX-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; AVX-NEXT:    vmovdqa %xmm7, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm9[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm15, %ymm6
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
 ; AVX-NEXT:    vandnps %ymm0, %ymm7, %ymm0
 ; AVX-NEXT:    vandps %ymm7, %ymm6, %ymm6
-; AVX-NEXT:    vorps %ymm0, %ymm6, %ymm7
-; AVX-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vorps %ymm0, %ymm6, %ymm15
+; AVX-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 32(%r9), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 32(%rax), %xmm6
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm14, %ymm14
-; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm7, %ymm15, %ymm7
-; AVX-NEXT:    vandnps %ymm14, %ymm15, %ymm14
-; AVX-NEXT:    vorps %ymm7, %ymm14, %ymm7
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm6[3],xmm14[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm0[0,2],xmm6[1,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm7, %ymm15, %ymm14
+; AVX-NEXT:    vandnps %ymm13, %ymm7, %ymm13
+; AVX-NEXT:    vorps %ymm13, %ymm14, %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpsrld $16, %xmm3, %xmm7
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX-NEXT:    vpsrld $16, %xmm3, %xmm13
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm7, %ymm3
-; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpsrld $16, %xmm9, %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm13, %ymm3
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
+; AVX-NEXT:    vpsrld $16, %xmm7, %xmm4
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm13, %ymm4
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
 ; AVX-NEXT:    vandnps %ymm3, %ymm7, %ymm3
 ; AVX-NEXT:    vandps %ymm7, %ymm4, %ymm4
 ; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4461,11 +4460,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm2
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1]
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; AVX-NEXT:    vmovdqa %xmm8, %xmm11
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
@@ -4479,72 +4479,69 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
 ; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX-NEXT:    vpsrld $16, %xmm7, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
-; AVX-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX-NEXT:    vmovdqa (%rdx), %xmm7
+; AVX-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX-NEXT:    vpsrld $16, %xmm3, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm10, %ymm0
+; AVX-NEXT:    vmovdqa (%rsi), %xmm13
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm14
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
-; AVX-NEXT:    vandps %ymm5, %ymm11, %ymm5
-; AVX-NEXT:    vorps %ymm2, %ymm5, %ymm15
-; AVX-NEXT:    vmovdqa (%r8), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm10, %ymm10
+; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
+; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
+; AVX-NEXT:    vorps %ymm0, %ymm5, %ymm15
+; AVX-NEXT:    vmovdqa (%r8), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%r9), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rax), %xmm2
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX-NEXT:    vandps %ymm4, %ymm15, %ymm11
-; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
-; AVX-NEXT:    vorps %ymm1, %ymm11, %ymm1
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT:    vandnps %ymm0, %ymm12, %ymm0
-; AVX-NEXT:    vandps %ymm1, %ymm12, %ymm1
+; AVX-NEXT:    vinsertps {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
+; AVX-NEXT:    vandps %ymm4, %ymm15, %ymm12
+; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
+; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpsrld $16, %xmm13, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm7[2,2,2,2]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm14, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[1,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
@@ -4552,194 +4549,190 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vpsrld $16, %xmm2, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpsrld $16, %xmm9, %xmm1
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpsrld $16, %xmm5, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
+; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[0,1,0,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[0,2],xmm10[1,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm5, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm0
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm8[1],xmm0[1]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
 ; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vpsrld $16, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
-; AVX-NEXT:    vandps %ymm5, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm3, %ymm5, %ymm3
-; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
+; AVX-NEXT:    vandps %ymm0, %ymm8, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm8, %ymm1
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa %xmm10, %xmm4
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm1, %ymm1
+; AVX-NEXT:    vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm14, %ymm1
+; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT:    vandnps %ymm0, %ymm12, %ymm0
-; AVX-NEXT:    vandps %ymm3, %ymm12, %ymm3
-; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = xmm8[0],xmm6[2],xmm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
 ; AVX-NEXT:    vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vandps %ymm3, %ymm15, %ymm3
-; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm0
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
-; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = mem[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm0
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1]
+; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = mem[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm9
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
 ; AVX-NEXT:    vandnps %ymm0, %ymm8, %ymm0
-; AVX-NEXT:    vandps %ymm3, %ymm8, %ymm3
-; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7]
-; AVX-NEXT:    vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = mem[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7]
-; AVX-NEXT:    vpsrld $16, %xmm1, %xmm10
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX-NEXT:    vandps %ymm8, %ymm9, %ymm9
+; AVX-NEXT:    vorps %ymm0, %ymm9, %ymm11
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = mem[0,1,2],xmm12[3],mem[4,5,6,7]
+; AVX-NEXT:    vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm9 = mem[0,1,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,0,0]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm12[6,7]
+; AVX-NEXT:    vpsrld $16, %xmm2, %xmm12
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX-NEXT:    vandps %ymm6, %ymm3, %ymm3
-; AVX-NEXT:    vandnps %ymm5, %ymm6, %ymm5
-; AVX-NEXT:    vorps %ymm5, %ymm3, %ymm5
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpslldq {{.*#+}} xmm10 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX-NEXT:    vandnps %ymm3, %ymm12, %ymm3
-; AVX-NEXT:    vandps %ymm12, %ymm10, %ymm10
-; AVX-NEXT:    vorps %ymm3, %ymm10, %ymm3
-; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm6, %ymm6
+; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX-NEXT:    vandps %ymm9, %ymm11, %ymm11
+; AVX-NEXT:    vandnps %ymm6, %ymm9, %ymm6
+; AVX-NEXT:    vorps %ymm6, %ymm11, %ymm6
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,2,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT:    vandnps %ymm11, %ymm14, %ymm11
+; AVX-NEXT:    vandps %ymm14, %ymm12, %ymm12
+; AVX-NEXT:    vorps %ymm11, %ymm12, %ymm11
+; AVX-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm10
-; AVX-NEXT:    vandnps %ymm3, %ymm15, %ymm3
-; AVX-NEXT:    vandps %ymm15, %ymm10, %ymm10
-; AVX-NEXT:    vorps %ymm3, %ymm10, %ymm3
-; AVX-NEXT:    vpsrlq $48, %xmm7, %xmm7
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1]
-; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = mem[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm7
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm13 = xmm13[0],xmm1[2],xmm13[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT:    vandnps %ymm11, %ymm15, %ymm11
+; AVX-NEXT:    vandps %ymm15, %ymm12, %ymm12
+; AVX-NEXT:    vorps %ymm11, %ymm12, %ymm11
+; AVX-NEXT:    vpsrlq $48, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm3[1]
+; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = mem[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm3, %ymm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm4, %ymm4
-; AVX-NEXT:    vandnps %ymm7, %ymm8, %ymm7
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
+; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
 ; AVX-NEXT:    vandps %ymm4, %ymm8, %ymm4
-; AVX-NEXT:    vorps %ymm7, %ymm4, %ymm4
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[0,1,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX-NEXT:    vpsrld $16, %xmm2, %xmm8
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm6, %ymm4, %ymm2
-; AVX-NEXT:    vandnps %ymm1, %ymm6, %ymm1
-; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = mem[0,1,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm10[0,0,0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm7
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX-NEXT:    vandps %ymm3, %ymm9, %ymm3
+; AVX-NEXT:    vandnps %ymm2, %ymm9, %ymm2
+; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
-; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
-; AVX-NEXT:    vmovaps %ymm5, 320(%rax)
+; AVX-NEXT:    vmovaps %ymm2, 96(%rax)
+; AVX-NEXT:    vmovaps %ymm11, 64(%rax)
+; AVX-NEXT:    vmovaps %ymm6, 320(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 288(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 192(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm1, (%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -4754,10 +4747,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps %xmm1, 416(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm1, 432(%rax)
-; AVX-NEXT:    vmovdqa %xmm14, 384(%rax)
+; AVX-NEXT:    vmovdqa %xmm5, 384(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 400(%rax)
-; AVX-NEXT:    addq $584, %rsp # imm = 0x248
+; AVX-NEXT:    addq $568, %rsp # imm = 0x238
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -5859,1245 +5852,1205 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $680, %rsp # imm = 0x2A8
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
-; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
-; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
-; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
-; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
-; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    subq $632, %rsp # imm = 0x278
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm6, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm7, %ymm2
+; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
-; AVX512-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm9
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm9, %ymm2
+; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm30
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512-NEXT:    vprold $16, %ymm13, %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm12
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm13
+; AVX512-NEXT:    vpor %ymm12, %ymm13, %ymm12
+; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm3
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm29
+; AVX512-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
+; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
+; AVX512-NEXT:    vpermi2d %zmm8, %zmm5, %zmm28
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7,8,9],ymm5[10],ymm8[11,12],ymm5[13],ymm8[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512-NEXT:    vprold $16, %xmm5, %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-NEXT:    vprold $16, %ymm14, %ymm5
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm16
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm12
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm4[2,1,3,3]
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm6
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm5[2,1,3,2]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm9
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm27
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
-; AVX512-NEXT:    vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
 ; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512-NEXT:    vprold $16, %ymm16, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm10
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-NEXT:    vmovdqa %ymm9, %ymm5
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-NEXT:    vprold $16, %ymm5, %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7,8,9],ymm3[10],ymm13[11,12],ymm3[13],ymm13[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm6 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm3 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm11[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm8[2,1,3,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm1))
+; AVX512-NEXT:    vmovdqa64 (%rax), %zmm2
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [11,0,0,11,0,0,0,12]
+; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,0,12,13,0,0,13,15]
+; AVX512-NEXT:    vpermd %zmm2, %zmm17, %zmm17
+; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm1, %zmm17
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[0,1,2,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm11, %zmm17
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm8))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm26, %zmm13
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17))
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512-NEXT:    vpermd %zmm2, %zmm8, %zmm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm13))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm13 # 32-byte Folded Reload
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm10[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3]
+; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm11 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm17 = ymm30[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm23 = ymm27[0,0,2,1]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm12 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm17 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm18 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm19 = mem[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm31[0,0,1,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm22[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm21[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm13 ^ (zmm21 & (zmm0 ^ zmm13))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm5, %zmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm18 & (zmm3 ^ zmm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[0,0,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm17[2,1,3,2]
 ; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
-; AVX512-NEXT:    vpbroadcastd (%rax), %ymm4
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm5
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm24, %zmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm21 & (zmm3 ^ zmm0))
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm26, %zmm25, %zmm3
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm5 ^ (mem & (zmm3 ^ zmm5))
+; AVX512-NEXT:    vpbroadcastd (%rax), %ymm5
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm28))
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd (%rax), %zmm6, %zmm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm27, 384(%rax)
-; AVX512-NEXT:    addq $680, %rsp # imm = 0x2A8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm27, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm3))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm3
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd %zmm2, %zmm7, %zmm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10))
+; AVX512-NEXT:    vmovdqa64 %zmm2, 128(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm5, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 320(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 256(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm12, 64(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 384(%rcx)
+; AVX512-NEXT:    addq $632, %rsp # imm = 0x278
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm16
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    subq $184, %rsp
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm3
 ; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm11, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
-; AVX512-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm9, %ymm22
+; AVX512-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
+; AVX512-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm24
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm1, %zmm23
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,2,2,3,10,0,11,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,2,2,3,8,0,9,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm18
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm24[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm0, %zmm21
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm15
-; AVX512-FCP-NEXT:    vpermi2q %zmm15, %zmm11, %zmm22
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm19
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,0,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm13
+; AVX512-FCP-NEXT:    vpermi2q %zmm13, %zmm10, %zmm20
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm0, %xmm0
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %xmm15, %xmm11
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm1
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm8, %zmm1, %zmm26
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm26
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm1
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,0,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm6, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm29
-; AVX512-FCP-NEXT:    vprold $16, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm5, %zmm6, %zmm28
+; AVX512-FCP-NEXT:    vprold $16, %ymm15, %ymm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8,9,10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm31
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm18
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm31
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm16
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm22[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm25, %zmm30
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm30
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm20, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm19, %ymm4
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm20, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm1, %ymm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm14, %zmm12
-; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm15
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0))
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15))
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26))
-; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm8
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12))
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[16,17,u,u]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm22[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3],xmm3[4],xmm15[5,6],xmm3[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm3
+; AVX512-FCP-NEXT:    vprold $16, %xmm1, %xmm7
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm15, %zmm7
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm12
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7,8,9],ymm6[10],ymm9[11,12],ymm6[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm4 & (zmm0 ^ zmm17))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm12))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [6,0,0,0,7,0,0,7]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm26 ^ (zmm8 & (zmm3 ^ zmm26))
+; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm16[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm14
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7))
-; AVX512-FCP-NEXT:    vpermd (%rax), %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14))
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7,8,9],ymm15[10],ymm7[11,12],ymm15[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm7))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
-; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
-; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm3))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm18 ^ (zmm4 & (zmm19 ^ zmm18))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [11,0,0,11,0,0,0,12]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,9,12,13,12,13,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm30))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 | (zmm1 & mem)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (mem & (zmm25 ^ zmm7))
+; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm19))
+; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512-FCP-NEXT:    addq $184, %rsp
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $680, %rsp # imm = 0x2A8
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    subq $632, %rsp # imm = 0x278
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm6, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm7, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm9, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm30
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm12
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm13
+; AVX512DQ-NEXT:    vpor %ymm12, %ymm13, %ymm12
+; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm15, %ymm3
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm29
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
+; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm8, %zmm5, %zmm28
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7,8,9],ymm5[10],ymm8[11,12],ymm5[13],ymm8[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-NEXT:    vprold $16, %ymm14, %ymm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm16
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm1
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm12
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm6
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm5[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm27
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT:    vprold $16, %ymm16, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm10
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm5
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT:    vprold $16, %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7,8,9],ymm3[10],ymm13[11,12],ymm3[13],ymm13[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm6 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm3 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm11[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm8[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm1))
+; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm2
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,0,12,13,0,0,13,15]
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm17
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm1, %zmm17
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm11, %zmm17
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm8))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm26, %zmm13
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17))
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm13))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm10[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm11 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm17 = ymm30[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm27[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm12 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm17 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm18 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm19 = mem[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm31[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm22[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm27 = ymm21[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm13 ^ (zmm21 & (zmm0 ^ zmm13))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem)
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm5, %zmm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm18 & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[0,0,0,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm17[2,1,3,2]
 ; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
-; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm4
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm5
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm24, %zmm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm21 & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm26, %zmm25, %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm5 ^ (mem & (zmm3 ^ zmm5))
+; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm5
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm28))
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm6, %zmm6
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 384(%rax)
-; AVX512DQ-NEXT:    addq $680, %rsp # imm = 0x2A8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm27, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm10
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm3))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm3
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm7, %zmm2
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10))
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 64(%rcx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 384(%rcx)
+; AVX512DQ-NEXT:    addq $632, %rsp # imm = 0x278
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    subq $184, %rsp
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm9
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm9, %ymm22
+; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm24
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm1, %zmm23
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,2,2,3,10,0,11,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,2,2,3,8,0,9,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm18
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm24[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm15
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm15, %zmm11, %zmm22
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,0,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm13
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm13, %zmm10, %zmm20
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm15, %xmm11
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm8, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm26
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,0,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm6, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm4, %zmm29
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm5, %zmm6, %zmm28
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm15, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8,9,10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm31
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm5, %zmm4, %zmm31
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm16
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm22[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm25, %zmm30
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm30
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm20, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm20, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm1, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm14, %zmm12
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm1, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15))
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26))
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[16,17,u,u]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm22[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3],xmm3[4],xmm15[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm3
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm15, %zmm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7,8,9],ymm6[10],ymm9[11,12],ymm6[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm4 & (zmm0 ^ zmm17))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm12))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [6,0,0,0,7,0,0,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm26 ^ (zmm8 & (zmm3 ^ zmm26))
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm14
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm16[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vpermd (%rax), %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14))
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7,8,9],ymm15[10],ymm7[11,12],ymm15[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
-; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm18 ^ (zmm4 & (zmm19 ^ zmm18))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,9,12,13,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm30))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 | (zmm1 & mem)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (mem & (zmm25 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm19))
+; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $184, %rsp
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -9214,7 +9167,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX-LABEL: store_i16_stride7_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $1496, %rsp # imm = 0x5D8
+; AVX-NEXT:    subq $1448, %rsp # imm = 0x5A8
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovdqa 112(%rsi), %xmm1
 ; AVX-NEXT:    vmovdqa 112(%rdi), %xmm4
@@ -9260,9 +9213,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT:    vandnps %ymm11, %ymm15, %ymm11
-; AVX-NEXT:    vandps %ymm15, %ymm12, %ymm12
+; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT:    vandnps %ymm11, %ymm13, %ymm11
+; AVX-NEXT:    vandps %ymm13, %ymm12, %ymm12
 ; AVX-NEXT:    vorps %ymm11, %ymm12, %ymm11
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7]
@@ -9349,13 +9302,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX-NEXT:    vpsrld $16, %xmm9, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX-NEXT:    vpsrld $16, %xmm8, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm3
 ; AVX-NEXT:    vmovdqa (%rsi), %xmm1
@@ -9381,8 +9334,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm4[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
 ; AVX-NEXT:    vandps %ymm7, %ymm5, %ymm5
@@ -9395,106 +9347,105 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[2,2,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6],xmm2[7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm1
-; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm0, %ymm15, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm15, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 16(%rdx), %xmm5
-; AVX-NEXT:    vmovdqa 16(%rcx), %xmm6
-; AVX-NEXT:    vpsrld $16, %xmm6, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vmovdqa 16(%rdx), %xmm3
+; AVX-NEXT:    vmovdqa 16(%rcx), %xmm5
+; AVX-NEXT:    vpsrld $16, %xmm5, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa 16(%rsi), %xmm10
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm9
-; AVX-NEXT:    vpsrld $16, %xmm10, %xmm1
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX-NEXT:    vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm8
+; AVX-NEXT:    vpsrld $16, %xmm9, %xmm1
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT:    vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqa 16(%r9), %xmm8
-; AVX-NEXT:    vmovdqa 16(%r8), %xmm7
-; AVX-NEXT:    vmovdqa 16(%rax), %xmm3
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX-NEXT:    vmovdqa 16(%r9), %xmm7
+; AVX-NEXT:    vmovdqa 16(%r8), %xmm6
+; AVX-NEXT:    vmovdqa 16(%rax), %xmm12
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3]
-; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm10[0,2],xmm12[1,3]
+; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
-; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm14, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,2,2,2]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm14, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 32(%rdx), %xmm15
-; AVX-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX-NEXT:    vpsrld $16, %xmm13, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX-NEXT:    vpsrld $16, %xmm3, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmovdqa 32(%rsi), %xmm2
@@ -9513,18 +9464,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 32(%r8), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 32(%rax), %xmm5
+; AVX-NEXT:    vmovdqa 32(%rax), %xmm4
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm4[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT:    vandps %ymm3, %ymm8, %ymm8
-; AVX-NEXT:    vandnps %ymm9, %ymm3, %ymm9
+; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX-NEXT:    vandps %ymm5, %ymm8, %ymm8
+; AVX-NEXT:    vandnps %ymm9, %ymm5, %ymm9
 ; AVX-NEXT:    vorps %ymm9, %ymm8, %ymm6
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpsrld $16, %xmm2, %xmm8
@@ -9533,67 +9483,67 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm8, %ymm2
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm13[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm2, %ymm12, %ymm2
-; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vandps %ymm3, %ymm0, %ymm0
 ; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3]
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[1,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT:    vandps %ymm0, %ymm15, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm15, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 48(%rdx), %xmm6
 ; AVX-NEXT:    vmovdqa 48(%rcx), %xmm8
 ; AVX-NEXT:    vpsrld $16, %xmm8, %xmm0
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa 48(%rsi), %xmm3
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm13
-; AVX-NEXT:    vpsrld $16, %xmm3, %xmm1
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 48(%rsi), %xmm4
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm9
+; AVX-NEXT:    vpsrld $16, %xmm4, %xmm1
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqa 48(%r9), %xmm10
-; AVX-NEXT:    vmovdqa 48(%r8), %xmm9
+; AVX-NEXT:    vmovdqa 48(%r9), %xmm2
+; AVX-NEXT:    vmovdqa 48(%r8), %xmm3
 ; AVX-NEXT:    vmovdqa 48(%rax), %xmm11
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm7, %xmm2
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3]
-; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm14[0,2],xmm11[1,3]
+; AVX-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm14, %ymm1
-; AVX-NEXT:    vmovaps %ymm14, %ymm5
+; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -9604,20 +9554,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; AVX-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; AVX-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
+; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
@@ -9626,14 +9576,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX-NEXT:    vmovdqa 64(%rcx), %xmm7
-; AVX-NEXT:    vpsrld $16, %xmm7, %xmm0
+; AVX-NEXT:    vmovdqa 64(%rcx), %xmm6
+; AVX-NEXT:    vpsrld $16, %xmm6, %xmm0
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; AVX-NEXT:    vmovdqa %xmm7, %xmm0
-; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX-NEXT:    vmovdqa %xmm6, %xmm0
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
 ; AVX-NEXT:    vmovdqa 64(%rsi), %xmm8
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm9
@@ -9644,26 +9594,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
-; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm1, %ymm6, %ymm1
-; AVX-NEXT:    vandps %ymm6, %ymm7, %ymm7
+; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    vandps %ymm3, %ymm7, %ymm7
 ; AVX-NEXT:    vorps %ymm1, %ymm7, %ymm10
 ; AVX-NEXT:    vmovdqa 64(%r9), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 64(%r8), %xmm6
-; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vmovdqa 64(%rax), %xmm6
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = xmm14[0,1,2],xmm6[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT:    vandps %ymm1, %ymm10, %ymm10
-; AVX-NEXT:    vandnps %ymm13, %ymm1, %ymm13
+; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm10
+; AVX-NEXT:    vandnps %ymm13, %ymm5, %ymm13
 ; AVX-NEXT:    vorps %ymm13, %ymm10, %ymm1
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpsrld $16, %xmm8, %xmm10
@@ -9676,7 +9624,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
@@ -9684,7 +9632,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vorps %ymm0, %ymm8, %ymm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
@@ -9693,57 +9641,55 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vandnps %ymm7, %ymm1, %ymm7
 ; AVX-NEXT:    vorps %ymm7, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 80(%rdx), %xmm1
-; AVX-NEXT:    vmovdqa 80(%rcx), %xmm9
-; AVX-NEXT:    vpsrld $16, %xmm9, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT:    vmovdqa 80(%rdx), %xmm9
+; AVX-NEXT:    vmovdqa 80(%rcx), %xmm8
+; AVX-NEXT:    vpsrld $16, %xmm8, %xmm0
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa 80(%rdi), %xmm8
-; AVX-NEXT:    vmovdqa 80(%rsi), %xmm12
-; AVX-NEXT:    vpsrld $16, %xmm12, %xmm7
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
-; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa 80(%rsi), %xmm5
+; AVX-NEXT:    vpsrld $16, %xmm5, %xmm7
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm10, %ymm7
-; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vandps %ymm4, %ymm7, %ymm7
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vandps %ymm1, %ymm7, %ymm7
 ; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm7
-; AVX-NEXT:    vmovdqa 80(%r9), %xmm4
-; AVX-NEXT:    vmovdqa 80(%r8), %xmm3
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX-NEXT:    vmovdqa 80(%r9), %xmm3
+; AVX-NEXT:    vmovdqa 80(%r8), %xmm2
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 80(%rax), %xmm2
-; AVX-NEXT:    vmovdqa %xmm6, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
+; AVX-NEXT:    vmovdqa 80(%rax), %xmm1
+; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3]
-; AVX-NEXT:    vmovdqa %xmm0, %xmm10
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm10[0,2],xmm1[1,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm6, %ymm6
-; AVX-NEXT:    vandps %ymm5, %ymm7, %ymm7
-; AVX-NEXT:    vandnps %ymm6, %ymm5, %ymm6
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT:    vandps %ymm0, %ymm7, %ymm7
+; AVX-NEXT:    vandnps %ymm6, %ymm0, %ymm6
 ; AVX-NEXT:    vorps %ymm6, %ymm7, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm1[2,2,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[2,2,2,2]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm15[6],xmm7[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm7, %ymm7
@@ -9751,76 +9697,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vandnps %ymm6, %ymm0, %ymm6
 ; AVX-NEXT:    vandps %ymm0, %ymm7, %ymm7
 ; AVX-NEXT:    vorps %ymm6, %ymm7, %ymm6
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
-; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3]
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm1[2,2,3,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm1[3],xmm15[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm15, %ymm7
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
 ; AVX-NEXT:    vandps %ymm0, %ymm6, %ymm6
 ; AVX-NEXT:    vandnps %ymm7, %ymm0, %ymm7
 ; AVX-NEXT:    vorps %ymm7, %ymm6, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpsrld $16, %xmm8, %xmm6
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX-NEXT:    vpsrld $16, %xmm2, %xmm6
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm15, %ymm7
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
 ; AVX-NEXT:    vandnps %ymm7, %ymm0, %ymm7
 ; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm14
-; AVX-NEXT:    vorps %ymm7, %ymm14, %ymm14
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vorps %ymm7, %ymm14, %ymm7
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT:    vandps %ymm3, %ymm14, %ymm13
-; AVX-NEXT:    vandnps %ymm12, %ymm3, %ymm12
-; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpsrld $16, %xmm1, %xmm12
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = xmm15[0,1,2],xmm0[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm15, %ymm13
+; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX-NEXT:    vandps %ymm4, %ymm7, %ymm7
+; AVX-NEXT:    vandnps %ymm13, %ymm4, %ymm13
+; AVX-NEXT:    vorps %ymm7, %ymm13, %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpsrld $16, %xmm1, %xmm7
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
 ; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm13 = mem[0,1,0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm2[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,2,2,2]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6],xmm13[7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm6, %ymm6
-; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT:    vandnps %ymm12, %ymm10, %ymm12
-; AVX-NEXT:    vandps %ymm6, %ymm10, %ymm6
-; AVX-NEXT:    vorps %ymm6, %ymm12, %ymm6
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7]
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm7, %ymm7
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm7, %ymm1, %ymm7
+; AVX-NEXT:    vandps %ymm1, %ymm6, %ymm6
+; AVX-NEXT:    vorps %ymm7, %ymm6, %ymm6
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm14[0,2],xmm0[1,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm13, %ymm7
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
 ; AVX-NEXT:    vandps %ymm0, %ymm6, %ymm6
 ; AVX-NEXT:    vandnps %ymm7, %ymm0, %ymm7
@@ -9833,152 +9778,150 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm7, %ymm7
-; AVX-NEXT:    vandnps %ymm6, %ymm10, %ymm6
-; AVX-NEXT:    vandps %ymm7, %ymm10, %ymm7
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[2,2,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX-NEXT:    vandnps %ymm6, %ymm1, %ymm6
+; AVX-NEXT:    vandps %ymm1, %ymm7, %ymm7
 ; AVX-NEXT:    vorps %ymm6, %ymm7, %ymm6
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm12, %ymm12
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm13 = xmm13[0],xmm4[2],xmm13[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm13, %ymm13
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
 ; AVX-NEXT:    vandnps %ymm6, %ymm5, %ymm6
-; AVX-NEXT:    vandps %ymm5, %ymm12, %ymm12
-; AVX-NEXT:    vorps %ymm6, %ymm12, %ymm4
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm13
+; AVX-NEXT:    vorps %ymm6, %ymm13, %ymm1
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpsrlq $48, %xmm15, %xmm6
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm14[1],xmm6[1]
-; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = mem[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm6, %ymm6
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm6, %ymm2, %ymm6
-; AVX-NEXT:    vandps %ymm2, %ymm12, %ymm12
-; AVX-NEXT:    vorps %ymm6, %ymm12, %ymm12
-; AVX-NEXT:    vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm13 = mem[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm6, %ymm6
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm13 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX-NEXT:    vandnps %ymm6, %ymm1, %ymm6
+; AVX-NEXT:    vandps %ymm1, %ymm13, %ymm13
+; AVX-NEXT:    vorps %ymm6, %ymm13, %ymm13
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[0,1,0,1]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5],xmm13[6,7]
+; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm12[0,0,0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm6[0,1,2],xmm14[3]
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm6
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm4, %ymm13
-; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX-NEXT:    vandps %ymm8, %ymm12, %ymm12
-; AVX-NEXT:    vandnps %ymm13, %ymm8, %ymm13
-; AVX-NEXT:    vorps %ymm13, %ymm12, %ymm0
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm4, %ymm14
+; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX-NEXT:    vandps %ymm6, %ymm13, %ymm13
+; AVX-NEXT:    vandnps %ymm14, %ymm6, %ymm14
+; AVX-NEXT:    vorps %ymm14, %ymm13, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm12
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm13
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm0[1],xmm12[1]
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm0[1],xmm13[1]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm14, %ymm14
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm12, %ymm0, %ymm12
-; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm13
-; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm12
+; AVX-NEXT:    vandnps %ymm13, %ymm0, %ymm13
+; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm14
+; AVX-NEXT:    vorps %ymm13, %ymm14, %ymm13
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm13
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm14
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm6, %xmm13, %xmm13
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm3, %ymm13
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm3, %ymm14
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
-; AVX-NEXT:    vandps %ymm7, %ymm12, %ymm12
-; AVX-NEXT:    vandnps %ymm13, %ymm7, %ymm13
-; AVX-NEXT:    vorps %ymm13, %ymm12, %ymm0
+; AVX-NEXT:    vandps %ymm7, %ymm13, %ymm13
+; AVX-NEXT:    vandnps %ymm14, %ymm7, %ymm14
+; AVX-NEXT:    vorps %ymm14, %ymm13, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
-; AVX-NEXT:    vandnps %ymm12, %ymm10, %ymm12
-; AVX-NEXT:    vandps %ymm10, %ymm13, %ymm13
-; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm12
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vpslldq {{.*#+}} xmm14 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm3[2,2,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm13, %ymm0, %ymm13
+; AVX-NEXT:    vandps %ymm0, %ymm14, %ymm14
+; AVX-NEXT:    vorps %ymm13, %ymm14, %ymm13
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
-; AVX-NEXT:    vandnps %ymm12, %ymm5, %ymm12
-; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm13
-; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm5
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm4[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = xmm15[0],xmm4[2],xmm15[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
+; AVX-NEXT:    vandnps %ymm13, %ymm5, %ymm13
+; AVX-NEXT:    vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT:    vorps %ymm13, %ymm14, %ymm5
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpsrlq $48, %xmm2, %xmm12
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1]
-; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = mem[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT:    vpsrlq $48, %xmm12, %xmm13
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm2[1],xmm13[1]
+; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX-NEXT:    vandnps %ymm12, %ymm2, %ymm12
-; AVX-NEXT:    vandps %ymm2, %ymm13, %ymm13
-; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm12
-; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = mem[0,1,0,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm14
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; AVX-NEXT:    vandnps %ymm13, %ymm2, %ymm13
+; AVX-NEXT:    vandps %ymm2, %ymm14, %ymm14
+; AVX-NEXT:    vorps %ymm13, %ymm14, %ymm13
+; AVX-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm14 = mem[0,1,0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm11[0,0,0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm15
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
 ; AVX-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm5, %ymm5
-; AVX-NEXT:    vandps %ymm8, %ymm12, %ymm12
-; AVX-NEXT:    vandnps %ymm5, %ymm8, %ymm5
-; AVX-NEXT:    vorps %ymm5, %ymm12, %ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm5, %ymm5
+; AVX-NEXT:    vandps %ymm6, %ymm13, %ymm13
+; AVX-NEXT:    vandnps %ymm5, %ymm6, %ymm5
+; AVX-NEXT:    vorps %ymm5, %ymm13, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm5
@@ -9987,68 +9930,68 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
 ; AVX-NEXT:    vandnps %ymm5, %ymm0, %ymm5
-; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm12
-; AVX-NEXT:    vorps %ymm5, %ymm12, %ymm5
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpsrld $16, %xmm1, %xmm12
+; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm13
+; AVX-NEXT:    vorps %ymm5, %ymm13, %ymm5
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm13
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7]
-; AVX-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm2, %ymm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3,4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm2, %ymm2
 ; AVX-NEXT:    vandps %ymm7, %ymm5, %ymm5
 ; AVX-NEXT:    vandnps %ymm2, %ymm7, %ymm2
 ; AVX-NEXT:    vorps %ymm2, %ymm5, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm13, %ymm2
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm12, %ymm2
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT:    vandnps %ymm2, %ymm10, %ymm2
-; AVX-NEXT:    vandps %ymm10, %ymm12, %ymm12
-; AVX-NEXT:    vorps %ymm2, %ymm12, %ymm2
+; AVX-NEXT:    vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT:    vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT:    vandps %ymm0, %ymm13, %ymm13
+; AVX-NEXT:    vorps %ymm2, %ymm13, %ymm2
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = xmm14[0],xmm0[2],xmm14[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
 ; AVX-NEXT:    vandnps %ymm2, %ymm14, %ymm2
-; AVX-NEXT:    vandps %ymm14, %ymm12, %ymm12
-; AVX-NEXT:    vorps %ymm2, %ymm12, %ymm2
-; AVX-NEXT:    vpsrlq $48, %xmm4, %xmm11
+; AVX-NEXT:    vandps %ymm14, %ymm13, %ymm13
+; AVX-NEXT:    vorps %ymm2, %ymm13, %ymm2
+; AVX-NEXT:    vpsrlq $48, %xmm11, %xmm11
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1]
 ; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm12 = mem[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
@@ -10056,22 +9999,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vandnps %ymm11, %ymm3, %ymm11
 ; AVX-NEXT:    vandps %ymm3, %ymm12, %ymm12
 ; AVX-NEXT:    vorps %ymm11, %ymm12, %ymm11
-; AVX-NEXT:    vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm10 = mem[0,1,0,1]
-; AVX-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm13[0,0,0,0]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7]
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm12
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
 ; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm8, %ymm11, %ymm10
-; AVX-NEXT:    vandnps %ymm1, %ymm8, %ymm1
+; AVX-NEXT:    vandps %ymm6, %ymm11, %ymm10
+; AVX-NEXT:    vandnps %ymm1, %ymm6, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm10, %ymm1
-; AVX-NEXT:    vpsrlq $48, %xmm9, %xmm4
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1]
+; AVX-NEXT:    vpsrlq $48, %xmm8, %xmm4
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -10086,12 +10027,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpsrld $16, %xmm3, %xmm8
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
 ; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm6
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm7, %ymm4, %ymm4
@@ -10121,8 +10062,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm5[2],xmm6[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX-NEXT:    vandnps %ymm3, %ymm14, %ymm3
 ; AVX-NEXT:    vandps %ymm4, %ymm14, %ymm4
@@ -10188,7 +10128,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps %xmm0, 848(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 832(%rax)
-; AVX-NEXT:    addq $1496, %rsp # imm = 0x5D8
+; AVX-NEXT:    addq $1448, %rsp # imm = 0x5A8
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -12522,2661 +12462,2631 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $2840, %rsp # imm = 0xB18
-; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm6
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm13
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
-; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm27
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm3
-; AVX512-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm0
-; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
-; AVX512-NEXT:    vpor %ymm10, %ymm9, %ymm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm31
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vprold $16, %ymm9, %ymm8
-; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm6
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
-; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512-NEXT:    vpermd %zmm6, %zmm18, %zmm7
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512-NEXT:    vpandn %ymm7, %ymm12, %ymm7
-; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm19
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm6
-; AVX512-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
-; AVX512-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm7
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm17
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm9
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm14
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm10
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm8
-; AVX512-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
-; AVX512-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512-NEXT:    vmovdqa (%r8), %xmm12
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm29
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm25
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm15
-; AVX512-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm6
-; AVX512-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    subq $2648, %rsp # imm = 0xA58
+; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm2
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm3, %ymm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm5
+; AVX512-NEXT:    vporq %ymm4, %ymm5, %ymm22
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm4
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm1, %ymm5
+; AVX512-NEXT:    vporq %ymm4, %ymm5, %ymm23
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm5
+; AVX512-NEXT:    vpshufb %ymm8, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm6
+; AVX512-NEXT:    vpshufb %ymm9, %ymm6, %ymm5
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm31
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm5
+; AVX512-NEXT:    vpshufb %ymm12, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512-NEXT:    vpshufb %ymm14, %ymm6, %ymm5
+; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm21
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm27
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512-NEXT:    vpshufb %ymm8, %ymm11, %ymm4
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm5
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm4
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT:    vpshufb %ymm14, %ymm12, %ymm5
+; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512-NEXT:    vpshufb %ymm4, %ymm15, %ymm4
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512-NEXT:    vpshufb %ymm7, %ymm13, %ymm5
+; AVX512-NEXT:    vpor %ymm5, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm4
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm5[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm5[2,2,2,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermd %zmm15, %zmm18, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm12
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vprold $16, %ymm4, %ymm1
+; AVX512-NEXT:    vpshufb %ymm7, %ymm4, %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm22, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (mem & (zmm0 ^ zmm2))
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm5)
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm30)
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm4[2,1,3,3]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm26 & (ymm0 ^ ymm2))
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512-NEXT:    vmovdqa 64(%rax), %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm5
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,0,2,1]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[0,0,1,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rax), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm4
+; AVX512-NEXT:    vpandn %ymm4, %ymm6, %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm23
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[0,0,1,1]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,2,1]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm18
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,0,1,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa %ymm14, %ymm1
+; AVX512-NEXT:    vpshufb %ymm14, %ymm11, %ymm9
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7,8,9,10],ymm9[11],ymm14[12,13],ymm9[14],ymm14[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,1,3,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vprold $16, %ymm13, %ymm9
+; AVX512-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm15[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm12
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm8
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm9
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512-NEXT:    vmovdqa %ymm8, %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8,9,10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,2,3,3]
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vprold $16, %ymm8, %ymm9
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,1,3,2]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm31[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm9
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm12
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm10
+; AVX512-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[0,0,1,1]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vprold $16, %xmm12, %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%r9), %xmm0
-; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm9, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm3
-; AVX512-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r9), %xmm11
+; AVX512-NEXT:    vmovdqa 96(%r8), %xmm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,1,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm16[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm14
+; AVX512-NEXT:    vpshufb %xmm14, %xmm8, %xmm11
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpermt2d %zmm9, %zmm8, %zmm11
+; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm9
+; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm21
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm26 & (zmm21 ^ zmm11))
+; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm25
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm12
+; AVX512-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm9, %zmm0
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm5
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512-NEXT:    vprold $16, %xmm21, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm1 & (zmm2 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm5, %zmm11, %zmm0
+; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512-NEXT:    vpbroadcastd 68(%rax), %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm20
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm30 & (zmm20 ^ zmm0))
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm9, %zmm3
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512-NEXT:    vprold $16, %xmm6, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm18[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm1 & (zmm10 ^ zmm3))
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm0
 ; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm11, %zmm0
 ; AVX512-NEXT:    vpbroadcastd (%rax), %ymm1
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm31
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm30 & (zmm31 ^ zmm0))
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512-NEXT:    vprold $16, %ymm7, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512-NEXT:    vprold $16, %xmm10, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,2,3,3]
+; AVX512-NEXT:    vprold $16, %ymm4, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[2,1,3,2]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm0[0,2,2,3]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm13
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT:    vprold $16, %xmm12, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512-NEXT:    vmovdqa %xmm14, %xmm4
+; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm14
+; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm8, %zmm0
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm30
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm25 & (zmm30 ^ zmm0))
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm11
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm18 = xmm8[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm3[2,1,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm22 = xmm3[0,1,1,3]
+; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm23 = mem[2,3,3,3,6,7,7,7]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm3))
 ; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm3 = mem[2,1,3,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm4 & (ymm3 ^ ymm7))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm16 & (ymm28 ^ ymm3))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm3 = (zmm3 & zmm28) | mem
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm5 = (zmm5 & zmm16) | mem
+; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm16 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm16 = (zmm16 & zmm0) | mem
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = (zmm0 & zmm28) | mem
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm0 ^ (zmm25 & (zmm17 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm5 ^ (zmm1 & (zmm17 ^ zmm5))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm25 & (zmm5 ^ zmm0))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm16 ^ (zmm1 & (zmm5 ^ zmm16))
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm0
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm19 = zmm19 | (zmm1 & mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm1 & (ymm27 ^ ymm0))
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm27[0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm16
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512-NEXT:    vpermd %zmm16, %zmm1, %zmm28
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm3 & (zmm28 ^ zmm0))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = (zmm0 & zmm1) | mem
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512-NEXT:    vpermd %zmm27, %zmm1, %zmm7
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm2))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm0 & (zmm31 ^ zmm10))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm3, %zmm29
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm0 ^ (zmm2 & (zmm29 ^ zmm0))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd %zmm16, %zmm10, %zmm2
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512-NEXT:    vpermd (%rax), %zmm14, %zmm14
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm0))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm15, %zmm0
+; AVX512-NEXT:    vpermd %zmm27, %zmm10, %zmm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm3 & (zmm10 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm0 & (zmm10 ^ zmm29))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm8 & (zmm9 ^ zmm0))
+; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[0,1,1,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm18
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm0 ^ (zmm8 & (zmm18 ^ zmm0))
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[0,0,0,1]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm7 = mem[2,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm16 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm21 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm27 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm8 = mem[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm14
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm8, %zmm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm22[0,0,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm24, %zmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm19
+; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm1))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm12 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm12 = mem[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm13 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512-NEXT:    vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm13 = mem[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm15 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm15 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm4 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm15 = mem[0,0,1,1]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm3 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm18 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm25 & (zmm3 ^ zmm4))
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm18, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm25 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm0 & (zmm21 ^ zmm3))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ zmm1))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [11,0,0,11,0,0,0,12,8,9,12,13,12,13,13,15]
+; AVX512-NEXT:    vpermd %zmm16, %zmm0, %zmm3
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ mem))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm13[0,1,2,3]
+; AVX512-NEXT:    vpermd %zmm27, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm4))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm23, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm31, 704(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm11, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm19, 768(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512-NEXT:    addq $2840, %rsp # imm = 0xB18
+; AVX512-NEXT:    vmovdqa64 %zmm30, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm31, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm21, 704(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm14, 640(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, 768(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm28, 832(%rax)
+; AVX512-NEXT:    addq $2648, %rsp # imm = 0xA58
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1576, %rsp # imm = 0x628
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
+; AVX512-FCP-NEXT:    subq $1288, %rsp # imm = 0x508
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm15, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
-; AVX512-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm20
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm14, %ymm17
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm16, %zmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14)
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm21
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm18
-; AVX512-FCP-NEXT:    vprold $16, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12))
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm14, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm12, %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm10
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm24
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm22
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm30, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vporq %ymm3, %ymm4, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
+; AVX512-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm15
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm9
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm14, %ymm9
+; AVX512-FCP-NEXT:    vporq %ymm9, %ymm7, %ymm19
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm10, %ymm29
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7,8,9],ymm15[10],ymm9[11,12],ymm15[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm16, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm17, %zmm15
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm7 ^ (mem & (zmm15 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm15 & ymm22)
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ymm10)
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm15, %ymm7
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,4,0,0,0,5,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm10 & (ymm15 ^ ymm7))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
+; AVX512-FCP-NEXT:    vprold $16, %ymm6, %ymm7
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm15))
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,2,2,3,10,9,11,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [218894094,0,488382238,488382238]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm20 & (zmm7 ^ zmm2))
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm2, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm4
-; AVX512-FCP-NEXT:    vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
-; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm2
+; AVX512-FCP-NEXT:    vpandnq %ymm2, %ymm16, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm28
+; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm10
+; AVX512-FCP-NEXT:    vpandnq %ymm10, %ymm16, %ymm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm15, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm31
+; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm13
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm10 ^ (zmm20 & (zmm3 ^ zmm10))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm23
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm14, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm13, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm3, %zmm9
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm4, %zmm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm27
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ zmm10))
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vprold $16, %ymm21, %ymm9
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm24[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm15
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm24[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm12
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm17, %ymm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm10, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm12))
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm10
+; AVX512-FCP-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm18[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm30[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7,8,9],ymm15[10],ymm10[11,12],ymm15[13],ymm10[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm29
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm15[2],ymm8[3,4],ymm15[5],ymm8[6,7,8,9],ymm15[10],ymm8[11,12],ymm15[13],ymm8[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm13, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm12 ^ (zmm8 & (zmm4 ^ zmm12))
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm4
-; AVX512-FCP-NEXT:    vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm5
-; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm19, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm20, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4))
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm19
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm18
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm13, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
-; AVX512-FCP-NEXT:    vprold $16, %ymm25, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm23
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm12, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm28
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermd 64(%rax), %zmm20, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm12, %ymm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm10 ^ (zmm8 & (zmm1 ^ zmm10))
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-FCP-NEXT:    vprold $16, %ymm10, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7,8,9],ymm0[10],ymm5[11,12],ymm0[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm17, %zmm21
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm22 & (zmm21 ^ zmm5))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm1))
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm12 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm12, %zmm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm3 ^ (zmm23 & (zmm9 ^ zmm3))
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm8, %xmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm15, %xmm18
+; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm11
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm30
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm11))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm9))
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (mem & (zmm1 ^ zmm5))
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm7, %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0))
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm24, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm14, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm4
 ; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
-; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm29
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0))
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm0
+; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm22 & (zmm17 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3))
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm16, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm25, %zmm8
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1))
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm12
+; AVX512-FCP-NEXT:    vprold $16, %xmm12, %xmm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm2 & (zmm5 ^ zmm3))
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm19
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm13, %xmm10
 ; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm26
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8))
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm11
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm13
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vprold $16, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5))
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vpermd (%rax), %zmm20, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0))
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm24
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm15
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm21
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm19, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0))
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm23
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8))
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0))
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0))
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm6
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm10
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, %xmm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm17
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm10, %zmm10
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1))
-; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm17
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm1, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15))
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm1
+; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm14
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm22 & (zmm14 ^ zmm1))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm18 & (ymm13 ^ ymm7))
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm7, %ymm11
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[3,3,3,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,2,2,2]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm7
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (mem & (zmm1 ^ zmm2))
+; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm16 & (zmm29 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm7 = mem[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm15
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm28, %ymm4, %ymm15
+; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm28
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm15, %zmm28
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm1))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13))
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm22, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11))
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm24, %zmm5
-; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm4
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm7
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6))
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ mem))
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm15 & (zmm5 ^ zmm3))
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14))
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = (zmm0 & zmm5) | mem
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm2 = (zmm2 & zmm5) | mem
-; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm11 = mem ^ (ymm1 & (ymm11 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm3 & (ymm12 ^ ymm11))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm11 & (ymm9 ^ ymm13))
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm12[0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm4 ^ (mem & (zmm13 ^ zmm4))
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm9, %ymm13, %ymm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm11, %zmm9
-; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm2 = (zmm2 & zmm11) | mem
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm8 = (zmm8 & zmm11) | mem
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm10 ^ (zmm15 & (zmm6 ^ zmm10))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm15 & (zmm5 ^ zmm4))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm4 & (zmm6 ^ zmm2))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm4 & (zmm5 ^ zmm8))
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = zmm0 | (zmm1 & mem)
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm9 = zmm9 | (zmm1 & mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2))
-; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 704(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 640(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
-; AVX512-FCP-NEXT:    addq $1576, %rsp # imm = 0x628
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [11,0,0,11,0,0,0,12]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2))
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,9,12,13,12,13,13,15]
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm8, %zmm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 704(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 640(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 576(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 512(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 832(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512-FCP-NEXT:    addq $1288, %rsp # imm = 0x508
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $2840, %rsp # imm = 0xB18
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm27
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpor %ymm10, %ymm9, %ymm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm31
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vprold $16, %ymm9, %ymm8
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm7
-; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpandn %ymm7, %ymm12, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm19
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm6
-; AVX512DQ-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
-; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm17
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm9
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm14
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm10
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm8
-; AVX512DQ-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm12
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm25
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512DQ-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    subq $2648, %rsp # imm = 0xA58
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vporq %ymm4, %ymm5, %ymm22
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm1, %ymm5
+; AVX512DQ-NEXT:    vporq %ymm4, %ymm5, %ymm23
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm6, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm6, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm21
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm27
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm5
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm11, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm12, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm15, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm13, %ymm5
+; AVX512DQ-NEXT:    vpor %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm4
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm27 = ymm5[3,3,3,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm5[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vprold $16, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm22, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (mem & (zmm0 ^ zmm2))
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm5)
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm30)
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm26 & (ymm0 ^ ymm2))
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512DQ-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[0,0,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermd %zmm15, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm4
+; AVX512DQ-NEXT:    vpandn %ymm4, %ymm6, %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm12
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm23
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[0,0,1,1]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm6
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm18
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,0,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm11, %ymm9
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7,8,9,10],ymm9[11],ymm14[12,13],ymm9[14],ymm14[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,1,3,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm9
+; AVX512DQ-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm15[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm8
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8,9,10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,2,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %ymm8, %ymm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm31[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm9, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm3
-; AVX512DQ-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm11
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm12
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm16[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm11
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm8, %zmm11
+; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm9
+; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm21
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm26 & (zmm21 ^ zmm11))
+; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm25
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm5
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512DQ-NEXT:    vprold $16, %xmm21, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,3]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm1 & (zmm2 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm11, %zmm0
+; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512DQ-NEXT:    vpbroadcastd 68(%rax), %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm20
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm30 & (zmm20 ^ zmm0))
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm9, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-NEXT:    vprold $16, %xmm6, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm18[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm10
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm1 & (zmm10 ^ zmm3))
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm11, %zmm0
 ; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm1
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm31
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm30 & (zmm31 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512DQ-NEXT:    vprold $16, %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,2,3,3]
+; AVX512DQ-NEXT:    vprold $16, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm14, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm8, %zmm0
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm30
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm25 & (zmm30 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm18 = xmm8[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm3[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm22 = xmm3[0,1,1,3]
+; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm23 = mem[2,3,3,3,6,7,7,7]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm3))
 ; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm3 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm4 & (ymm3 ^ ymm7))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm16 & (ymm28 ^ ymm3))
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm3 = (zmm3 & zmm28) | mem
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm5 = (zmm5 & zmm16) | mem
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm16 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm16 = (zmm16 & zmm0) | mem
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = (zmm0 & zmm28) | mem
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm0 ^ (zmm25 & (zmm17 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm5 ^ (zmm1 & (zmm17 ^ zmm5))
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm25 & (zmm5 ^ zmm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm16 ^ (zmm1 & (zmm5 ^ zmm16))
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm19 = zmm19 | (zmm1 & mem)
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm1 & (ymm27 ^ ymm0))
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm27[0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm16
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm1, %zmm28
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm3 & (zmm28 ^ zmm0))
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = (zmm0 & zmm1) | mem
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512DQ-NEXT:    vpermd %zmm27, %zmm1, %zmm7
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm2))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm0 & (zmm31 ^ zmm10))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm3, %zmm29
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm0 ^ (zmm2 & (zmm29 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm10, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm14, %zmm14
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm0))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm27, %zmm10, %zmm10
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm3 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm0 & (zmm10 ^ zmm29))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm8 & (zmm9 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm18
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm0 ^ (zmm8 & (zmm18 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[0,0,0,1]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm7 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm16 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm21 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm27 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm8 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm14
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm8, %zmm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm22[0,0,0,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm24, %zmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm19
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm1))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm12 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512DQ-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm12 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm13 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512DQ-NEXT:    vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm13 = mem[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm3 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm18 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm25 & (zmm3 ^ zmm4))
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm25 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm0 & (zmm21 ^ zmm3))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ zmm1))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [11,0,0,11,0,0,0,12,8,9,12,13,12,13,13,15]
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm0, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ mem))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm13[0,1,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm27, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm4))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 704(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 768(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512DQ-NEXT:    addq $2840, %rsp # imm = 0xB18
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm31, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 704(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 640(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 768(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 832(%rax)
+; AVX512DQ-NEXT:    addq $2648, %rsp # imm = 0xA58
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1576, %rsp # imm = 0x628
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
+; AVX512DQ-FCP-NEXT:    subq $1288, %rsp # imm = 0x508
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm15, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm9
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm20
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm14, %ymm17
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm16, %zmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14)
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm21
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm18
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12))
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm14, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm12, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm24
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm22
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm30, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vporq %ymm3, %ymm4, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm14, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm9
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm14, %ymm9
+; AVX512DQ-FCP-NEXT:    vporq %ymm9, %ymm7, %ymm19
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm10, %ymm29
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7,8,9],ymm15[10],ymm9[11,12],ymm15[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm16, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm7 ^ (mem & (zmm15 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm15 & ymm22)
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ymm10)
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm15, %ymm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,4,0,0,0,5,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm10 & (ymm15 ^ ymm7))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm15))
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,2,2,3,10,9,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [218894094,0,488382238,488382238]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm20 & (zmm7 ^ zmm2))
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vpandn %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm2, %ymm16, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm13, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm28
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm10
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm10, %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm15, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm10 ^ (zmm20 & (zmm3 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm3, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm4, %zmm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm27
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm24[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm24[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm12
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm17, %ymm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm10, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm12))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm18[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm30[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7,8,9],ymm15[10],ymm10[11,12],ymm15[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm29
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm15[2],ymm8[3,4],ymm15[5],ymm8[6,7,8,9],ymm15[10],ymm8[11,12],ymm15[13],ymm8[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm12 ^ (zmm8 & (zmm4 ^ zmm12))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vpandn %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm19, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm19
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm18
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm13, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm12
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm23
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm28
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermd 64(%rax), %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm12, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm10 ^ (zmm8 & (zmm1 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7,8,9],ymm0[10],ymm5[11,12],ymm0[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm17, %zmm21
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm22 & (zmm21 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm12 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm3 ^ (zmm23 & (zmm9 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm15, %xmm18
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm11))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (mem & (zmm1 ^ zmm5))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm7, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm14, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm31
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm29
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm22 & (zmm17 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm16, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm25, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm9
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm12
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm2 & (zmm5 ^ zmm3))
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm19
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, %xmm10
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm26
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8))
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vpermd (%rax), %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm24
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm15
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm19, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm23
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8))
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm19, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm25, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0))
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm24, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm10
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm17
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1))
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm17
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm14
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm22 & (zmm14 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm18 & (ymm13 ^ ymm7))
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm7, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[3,3,3,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (mem & (zmm1 ^ zmm2))
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm16 & (zmm29 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm15
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm28, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm28
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm15, %zmm28
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm1))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13))
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm22, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm25, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11))
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6))
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm15 & (zmm5 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm3))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14))
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = (zmm0 & zmm5) | mem
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm2 = (zmm2 & zmm5) | mem
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm11 = mem ^ (ymm1 & (ymm11 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm3 & (ymm12 ^ ymm11))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm11 & (ymm9 ^ ymm13))
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm12[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm4 ^ (mem & (zmm13 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm2 = (zmm2 & zmm11) | mem
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm8 = (zmm8 & zmm11) | mem
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm10 ^ (zmm15 & (zmm6 ^ zmm10))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm15 & (zmm5 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm4 & (zmm6 ^ zmm2))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm4 & (zmm5 ^ zmm8))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0 | (zmm1 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm9 = zmm9 | (zmm1 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
-; AVX512DQ-FCP-NEXT:    addq $1576, %rsp # imm = 0x628
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2))
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,9,12,13,12,13,13,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermd %zmm9, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermd %zmm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 704(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 640(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 832(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-FCP-NEXT:    addq $1288, %rsp # imm = 0x508
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 78b07e5671e5a..9c8928c40d75c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -1047,7 +1047,8 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
+; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm13 = [4,5]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm13, %ymm13
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm13
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
@@ -2016,159 +2017,162 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $232, %rsp
 ; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX2-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm9
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm10
 ; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm0
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm1
+; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[1,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm3
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm5
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm4
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm6
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm5, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm14
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm15
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[6],ymm15[6],ymm3[7],ymm15[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
+; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm7 = [4,5]
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm7, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm2
 ; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm3
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm8
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm12, %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm9
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm12, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3]
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm6
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm12, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm6
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm9, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm8
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm12 = xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm4, %ymm12
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm9, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm9, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm9, %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm1, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, 160(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, 288(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, 256(%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, 288(%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, 256(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 352(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4389,338 +4393,344 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-FCP-LABEL: store_i32_stride6_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $872, %rsp # imm = 0x368
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX2-FCP-NEXT:    subq $936, %rsp # imm = 0x3A8
+; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm8
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX2-FCP-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
 ; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3]
+; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm10
 ; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm8
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm7[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm10[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm5
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm14
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm13
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm15
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
+; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm15
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm11
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm10
-; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm12
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm8, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm5
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm14
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm14[0],zero,xmm14[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm8
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm12
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm10
+; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm11
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps (%rdx), %ymm12
+; AVX2-FCP-NEXT:    vmovaps (%rcx), %ymm11
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm8
+; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm4
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [4,u,5,u]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 20(%r9), %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm3
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm8
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm13, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rdx), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 32(%rcx), %ymm9
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 32(%r8), %ymm3
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 52(%r9), %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpbroadcastd 64(%rcx), %xmm1
-; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm4
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm14, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %ymm4
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %ymm12
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm2
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm13, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 84(%r9), %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd %xmm15, %xmm1
-; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT:    vmovaps 64(%rdx), %ymm8
+; AVX2-FCP-NEXT:    vmovaps 64(%rcx), %ymm7
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rsi), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 64(%r8), %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 84(%r9), %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm11, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm14, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 96(%r9), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %ymm9
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 116(%r9), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vmovaps 96(%rdx), %ymm6
+; AVX2-FCP-NEXT:    vmovaps 96(%rcx), %ymm5
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm13
+; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovaps 96(%r8), %ymm1
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm13 = [4,u,5,u]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm13, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 116(%r9), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[2,3],ymm5[2,3]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm11, %ymm5
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm13, %ymm15
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm15
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm4[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 16(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = mem[2,3],ymm11[2,3]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm11 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm12 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm12, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm13, %ymm4
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovaps 32(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm0, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 48(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm11, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovaps 64(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm2[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 80(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovaps 96(%r9), %ymm8
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 112(%r9), %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[2,3],ymm5[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %ymm10
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %ymm11
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 80(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[2,3],ymm4[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm15, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm10, %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %ymm10
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 112(%r9), %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[2,3],ymm2[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm15, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm11, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7]
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, 736(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, 672(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 640(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, 544(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, 480(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, 448(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, 352(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, 288(%rax)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 256(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, 160(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 736(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 672(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 640(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 544(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm10, 480(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 448(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 352(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 288(%rax)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 256(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm15, 160(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4749,7 +4759,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-FCP-NEXT:    addq $872, %rsp # imm = 0x368
+; AVX2-FCP-NEXT:    addq $936, %rsp # imm = 0x3A8
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -9031,33 +9041,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-FCP-LABEL: store_i32_stride6_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $2376, %rsp # imm = 0x948
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX2-FCP-NEXT:    subq $2600, %rsp # imm = 0xA28
+; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm7
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
 ; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm8
+; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %xmm5
 ; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
-; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
+; AVX2-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3]
 ; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm14
 ; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm4
@@ -9065,13 +9075,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3]
 ; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm4
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
@@ -9082,657 +9092,671 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3]
 ; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm15
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm13
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm11
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm11[2],xmm4[3],xmm11[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm4
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm10
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm10
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm5[0,1,2,1]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm15
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm8
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm10
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm10[0],zero,xmm10[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rcx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rsi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 128(%r8), %xmm5
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 132(%r9), %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 128(%r8), %xmm12
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm12[0],zero,xmm12[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 132(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm13[3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rcx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rsi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm13
+; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 160(%r8), %xmm6
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 164(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 160(%r8), %xmm0
+; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 164(%r9), %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rcx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rsi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 192(%r8), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 196(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 224(%rcx), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 196(%r9), %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 224(%rcx), %xmm8
+; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,3]
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdx), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rsi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 224(%r8), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 228(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm3
-; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm14
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 228(%r9), %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm8
+; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm9
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm14
-; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps (%rdx), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps (%rcx), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm1
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [4,u,5,u]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 20(%r9), %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm2
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm8
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm6[0],mem[0],xmm6[1],mem[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm14, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm0
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm1
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm11, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 32(%rdx), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm6[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %ymm7
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 32(%r8), %ymm9
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 52(%r9), %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 64(%rcx), %xmm2
+; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm7
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 64(%rcx), %xmm0
-; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm1
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 64(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rsi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 64(%r8), %ymm8
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 84(%r9), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm10, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 96(%r9), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 84(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm8, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 96(%r9), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 116(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 128(%rcx), %xmm0
-; AVX2-FCP-NEXT:    vpbroadcastd 128(%rdx), %xmm4
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm5, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 128(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdx), %ymm12
-; AVX2-FCP-NEXT:    vmovdqa 128(%rcx), %ymm9
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rsi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 148(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq %xmm6, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 160(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdx), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa 160(%rcx), %ymm7
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 160(%rsi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 180(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpbroadcastd %xmm11, %xmm0
+; AVX2-FCP-NEXT:    vmovaps 96(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 96(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 96(%r8), %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 116(%r9), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 128(%rcx), %xmm2
+; AVX2-FCP-NEXT:    vpbroadcastd 128(%rdx), %xmm3
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq %xmm12, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 128(%r9), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 128(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 128(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 128(%rsi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 128(%r8), %ymm3
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 148(%r9), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm13[0],mem[0],xmm13[1],mem[1]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 192(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdx), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 192(%rcx), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 192(%rsi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 212(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastss 224(%rcx), %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 160(%r9), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 160(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 160(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 160(%rsi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 160(%r8), %ymm6
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 180(%r9), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm2
+; AVX2-FCP-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 192(%r9), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rcx), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rsi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 192(%r8), %ymm10
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 212(%r9), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vbroadcastss 224(%rcx), %xmm2
 ; AVX2-FCP-NEXT:    vbroadcastss 224(%rdx), %xmm4
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 224(%r9), %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 224(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 224(%rcx), %ymm11
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 224(%rsi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 224(%r8), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 244(%r9), %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdx), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 224(%rcx), %ymm8
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 224(%rsi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 244(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm15
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[2,3],ymm3[2,3]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm5, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm13
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm5, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %ymm13
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 80(%r9), %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm6, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm5, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %ymm13
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 112(%r9), %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm5, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 128(%r8), %ymm2
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 128(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 144(%r9), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 160(%r8), %ymm3
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 160(%r9), %ymm2
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 176(%r9), %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 192(%r8), %ymm2
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 192(%r9), %ymm3
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 208(%r9), %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[2,3],ymm15[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm11, %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 224(%r8), %ymm15
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa 224(%r9), %ymm13
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 240(%r9), %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm15
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 16(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = mem[2,3],ymm2[2,3]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm2 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm2, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm14 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm14, %ymm1, %ymm14
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovaps 32(%r9), %ymm15
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 48(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = mem[2,3],ymm13[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm4, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm2, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm9 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm9, %ymm1, %ymm9
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovaps 64(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 80(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm2, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm8, %ymm1, %ymm8
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vmovaps 96(%r9), %ymm9
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm5, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 112(%r9), %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm8 = mem[2,3],ymm8[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm5, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm2, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovaps 128(%r9), %ymm8
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 144(%r9), %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm2, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovaps 160(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 176(%r9), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = mem[2,3],ymm14[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm2, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm13 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm13, %ymm1, %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm5, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vmovaps 192(%r9), %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 208(%r9), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm12[2],ymm0[2],ymm12[3],ymm0[3],ymm12[6],ymm0[6],ymm12[7],ymm0[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = mem[2,3],ymm13[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm4, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0],ymm1[1],ymm10[2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovaps 224(%r9), %ymm13
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 240(%r9), %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 $19, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[2,3],ymm11[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, 1504(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, 1440(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 1408(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, 1312(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, 1248(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 1216(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, 1120(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, 1056(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, 1024(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, 928(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 1504(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm10, 1440(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 1408(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 1312(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm15, 1248(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm14, 1216(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 1120(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 1056(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 1024(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 928(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 864(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -9809,7 +9833,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-FCP-NEXT:    addq $2376, %rsp # imm = 0x948
+; AVX2-FCP-NEXT:    addq $2600, %rsp # imm = 0xA28
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 62e2aadd818c1..03f3952e016ff 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -52,9 +52,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
 ; AVX-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
@@ -64,13 +63,14 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm0[2,1],ymm6[4,6],ymm0[6,5]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm3[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm1
-; AVX-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5]
-; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,u,5]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7]
 ; AVX-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm0
@@ -499,10 +499,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
 ; AVX-NEXT:    vbroadcastss 12(%rcx), %xmm2
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX-NEXT:    vmovaps %xmm0, 96(%rax)
 ; AVX-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX-NEXT:    vmovaps %ymm9, 64(%rax)
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT:    vmovaps %xmm0, 96(%rax)
 ; AVX-NEXT:    vmovaps %ymm6, 32(%rax)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -6182,7 +6182,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovaps (%rax), %xmm0
 ; AVX2-FCP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 32(%rax), %xmm3
-; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovaps %xmm3, (%rsp) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovaps (%r8), %xmm2
 ; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6196,21 +6196,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovaps (%rcx), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovaps (%rcx), %xmm9
 ; AVX2-FCP-NEXT:    vmovaps 32(%rcx), %xmm6
 ; AVX2-FCP-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps (%rdx), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm9
+; AVX2-FCP-NEXT:    vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm9[1],zero
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm14
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rsi), %xmm11
+; AVX2-FCP-NEXT:    vmovaps (%rsi), %xmm13
 ; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %xmm8
 ; AVX2-FCP-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm13[1,1,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
@@ -6257,9 +6256,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 96(%r8), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%r9), %xmm7
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX2-FCP-NEXT:    vmovaps %xmm7, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 96(%r9), %xmm5
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; AVX2-FCP-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX2-FCP-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovaps 96(%rax), %xmm1
@@ -6281,34 +6280,34 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm13
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7]
-; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7]
+; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT:    vmovaps (%rdx), %ymm6
+; AVX2-FCP-NEXT:    vmovaps (%rdx), %ymm8
 ; AVX2-FCP-NEXT:    vmovaps (%rcx), %ymm10
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm8
-; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm14
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6]
-; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm7
+; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm6
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps 16(%rax), %xmm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6350,47 +6349,49 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm2
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT:    vmovaps 96(%rdx), %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdx), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 96(%rcx), %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vbroadcastsd 112(%r8), %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vbroadcastss 112(%r9), %xmm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 112(%rax), %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vmovaps 96(%r9), %ymm15
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [4,4,5,5]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm12, %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 112(%rax), %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastss 112(%rdx), %ymm3
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 108(%r8), %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-FCP-NEXT:    vbroadcastss 112(%rdx), %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 108(%r8), %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm12 = xmm5[2,2,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm12, %ymm12
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermps 96(%r9), %ymm12, %ymm12
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7]
 ; AVX2-FCP-NEXT:    vmovaps 96(%rax), %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6401,16 +6402,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vbroadcastsd 120(%rax), %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps %xmm9, %xmm15
+; AVX2-FCP-NEXT:    vbroadcastss %xmm9, %xmm0
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm0
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vbroadcastss %xmm12, %xmm1
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT:    vmovaps %xmm11, %xmm15
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2]
-; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2]
+; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -6420,26 +6420,25 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm14[3,3],xmm13[3,3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm15[2],xmm12[3],xmm15[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2]
-; AVX2-FCP-NEXT:    vmovaps %xmm5, %xmm9
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 8(%rax), %ymm3
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 16(%rax), %ymm3
@@ -6454,20 +6453,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3]
 ; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3]
@@ -6494,125 +6493,126 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm1
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm2
+; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm2
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 72(%rax), %ymm3
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[1,1],ymm10[1,1],ymm11[5,5],ymm10[5,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX2-FCP-NEXT:    vbroadcastsd 80(%rax), %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd 80(%rax), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm1
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm3
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm5, %xmm4
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm11, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm11, %ymm4
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT:    vbroadcastsd 104(%rax), %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 16(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm9, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm0[3,3],xmm1[3,3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm9, %ymm6
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm4[5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vbroadcastsd 104(%rax), %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm4[2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 16(%rdx), %ymm4
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm6 = xmm2[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm6 = xmm6[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 48(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 48(%rdx), %ymm6
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm7 = xmm5[3,3],mem[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[4],ymm1[4],ymm8[5],ymm1[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm7 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7]
 ; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
@@ -6626,7 +6626,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vbroadcastss 80(%rdx), %ymm8
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
@@ -6634,13 +6634,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm9 = xmm9[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm12[3,3],ymm14[3,3],ymm12[7,7],ymm14[7,7]
 ; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
@@ -6649,10 +6648,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovaps %ymm9, 640(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm8, 544(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm7, 416(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 320(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 192(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 320(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 192(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 736(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 736(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 672(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -13003,12 +13002,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm15
-; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm13
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT:    vmovaps (%r8), %ymm13
+; AVX2-FCP-NEXT:    vmovaps (%r9), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3,4,5],ymm13[6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps 16(%rax), %xmm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
@@ -13060,11 +13059,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovaps 96(%rdx), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13106,11 +13105,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 160(%rsi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vmovaps 160(%rsi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdx), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13129,17 +13128,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovaps 192(%rsi), %ymm12
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
-; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovaps 192(%rsi), %ymm8
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7]
+; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdx), %ymm7
-; AVX2-FCP-NEXT:    vmovaps 192(%rcx), %ymm8
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
+; AVX2-FCP-NEXT:    vmovaps 192(%rcx), %ymm12
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 192(%r8), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13157,99 +13156,101 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-FCP-NEXT:    vmovaps 224(%rcx), %xmm3
+; AVX2-FCP-NEXT:    vmovaps 224(%rcx), %xmm4
 ; AVX2-FCP-NEXT:    vmovaps 224(%rdx), %xmm6
-; AVX2-FCP-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero
+; AVX2-FCP-NEXT:    vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm4[1],zero
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 228(%r8), %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 224(%r9), %xmm4
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 228(%r8), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 224(%r9), %xmm3
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 224(%rax), %ymm5, %ymm5
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastss %xmm3, %xmm2
+; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %xmm2
 ; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm5
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2]
 ; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm9, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd 224(%r8), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 224(%rax), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm9, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd 224(%r8), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss %xmm3, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 224(%rax), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovaps 224(%r8), %ymm6
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 232(%rax), %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovaps 224(%rsi), %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT:    vmovaps 224(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 224(%rsi), %ymm14
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT:    vmovaps 224(%rdx), %ymm2
 ; AVX2-FCP-NEXT:    vmovaps 224(%rcx), %ymm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vbroadcastsd 240(%r8), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vbroadcastss 240(%r9), %xmm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 240(%rax), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 220(%r8), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 220(%r9), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd 216(%rax), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm2[1,1],ymm0[1,1],ymm2[5,5],ymm0[5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vbroadcastsd 240(%r8), %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm4[1,2,3,4,5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vmovaps 224(%r9), %ymm4
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm15 = [4,4,5,5]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm15, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 240(%rax), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[6],ymm7[6],ymm12[7],ymm7[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 220(%r8), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 220(%r9), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd 216(%rax), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm15[0],ymm10[1,2,3,4,5,6],ymm15[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastss 240(%rdx), %ymm11
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 236(%r8), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermps 224(%r9), %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vmovaps 224(%rax), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT:    vbroadcastss 240(%rdx), %ymm10
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6],ymm15[7]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 236(%r8), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [5,6,5,6,5,6,5,6]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovaps 224(%rax), %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm15[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -13266,24 +13267,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %xmm1
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm6[3,3]
 ; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 8(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13298,39 +13299,40 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 16(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm3, %xmm0
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm4, %xmm0
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm1
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 40(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
@@ -13354,30 +13356,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm7, %xmm0
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm1
+; AVX2-FCP-NEXT:    vbroadcastss %xmm6, %xmm0
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT:    vbroadcastss %xmm5, %xmm1
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 72(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13412,9 +13414,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13425,16 +13427,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 104(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -13460,9 +13462,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13473,20 +13475,20 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 136(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
@@ -13508,9 +13510,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13521,16 +13523,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 168(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -13556,9 +13558,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13569,213 +13571,215 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vbroadcastsd 200(%rax), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5]
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[1,1,1,1,5,5,5,5]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT:    vbroadcastsd 208(%rax), %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm11[1,1],ymm0[5,5],ymm11[5,5]
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT:    vbroadcastsd 208(%rax), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vbroadcastss 16(%rdx), %ymm0
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[3,3],mem[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastss 48(%rdx), %ymm0
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7]
+; AVX2-FCP-NEXT:    vbroadcastss 48(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm6 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm6 = xmm2[3,3],mem[3,3]
 ; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm6 = xmm6[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 80(%rdx), %ymm0
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 80(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm6 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm6 = xmm2[3,3],mem[3,3]
 ; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm6 = xmm6[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
-; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4],ymm6[5,6],ymm8[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 112(%rdx), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 112(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps %ymm12, %ymm1
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm9 = xmm8[3,3],mem[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm9 = xmm9[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm8[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
-; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 144(%rdx), %ymm10
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm8[0],ymm4[1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 144(%rdx), %ymm4
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vmovaps (%rsp), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm13 = xmm8[3,3],mem[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm13 = xmm13[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm8 = xmm8[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[6],ymm3[6],ymm15[7],ymm3[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 176(%rdx), %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 176(%rdx), %ymm13
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm15 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vmovaps %ymm14, %ymm3
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm15 = xmm14[3,3],mem[3,3]
 ; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm15 = xmm15[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1,2,3],ymm13[4,5,6,7]
 ; AVX2-FCP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
-; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT:    vbroadcastss 208(%rdx), %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7]
+; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm11[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT:    vbroadcastss 208(%rdx), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm8 = xmm1[3,3],mem[3,3]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm8 = xmm8[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm14 = xmm1[3,3],mem[3,3]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm14 = xmm14[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 1440(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 1440(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 1312(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm14, 1216(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm13, 1088(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm10, 992(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm13, 1216(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 1088(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 992(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm9, 864(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 768(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm12, 768(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm6, 640(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 544(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 544(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm7, 416(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm5, 320(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index fba3aa50af23d..7e9739cbdbd39 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -464,7 +464,7 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[3,0],ymm8[7,4],ymm4[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,3],xmm0[3,3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
 ; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 05c111ae5049f..8362ff7d777c4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -767,8 +767,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7]
-; AVX-NEXT:    vmovapd 48(%rsi), %xmm13
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm10 = mem[0],xmm10[1]
 ; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm13
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3]
@@ -1703,7 +1702,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovaps 32(%rdx), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -1712,7 +1711,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3]
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovaps 64(%rdi), %ymm0
 ; AVX-NEXT:    vbroadcastsd 72(%rsi), %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -1735,24 +1734,23 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3]
-; AVX-NEXT:    vmovapd 48(%rdx), %xmm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3]
-; AVX-NEXT:    vmovapd 48(%rsi), %xmm8
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1]
-; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm8
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3]
-; AVX-NEXT:    vmovapd 32(%r8), %ymm8
-; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3]
-; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3]
-; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3]
+; AVX-NEXT:    vmovapd 48(%rdx), %xmm8
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[2],ymm5[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm9
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3]
+; AVX-NEXT:    vmovapd 32(%r8), %ymm9
+; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm9[0],ymm13[1,2,3]
+; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3]
+; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm9[2],ymm5[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm8[0,1,2],ymm9[3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
@@ -1766,8 +1764,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3]
 ; AVX-NEXT:    vmovapd 112(%rdx), %xmm2
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
-; AVX-NEXT:    vmovapd 112(%rsi), %xmm1
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm2[1]
 ; AVX-NEXT:    vbroadcastsd 120(%rcx), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovapd 96(%r8), %ymm0
@@ -3506,16 +3503,16 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-LABEL: store_i64_stride5_vf32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $1048, %rsp # imm = 0x418
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm9
-; AVX-NEXT:    vmovapd 160(%rdi), %ymm7
-; AVX-NEXT:    vmovapd 96(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX-NEXT:    vmovapd 160(%rdi), %ymm9
+; AVX-NEXT:    vmovapd 96(%rdi), %ymm14
 ; AVX-NEXT:    vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT:    vmovaps (%rcx), %ymm1
 ; AVX-NEXT:    vmovaps 64(%rcx), %ymm2
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
-; AVX-NEXT:    vmovaps 16(%rdx), %xmm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3]
+; AVX-NEXT:    vmovaps 16(%rdx), %xmm5
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
@@ -3524,7 +3521,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
 ; AVX-NEXT:    vmovapd 96(%rcx), %xmm2
 ; AVX-NEXT:    vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -3536,13 +3533,13 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2]
 ; AVX-NEXT:    vmovapd 160(%rcx), %xmm1
 ; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2]
 ; AVX-NEXT:    vmovaps 192(%rcx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -3567,32 +3564,32 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 32(%rdi), %ymm2
+; AVX-NEXT:    vmovapd 32(%rdi), %ymm3
 ; AVX-NEXT:    vbroadcastsd 40(%rsi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovaps 32(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm14
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm2
 ; AVX-NEXT:    vbroadcastsd 72(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 64(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 104(%rsi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovaps 96(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -3601,212 +3598,208 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX-NEXT:    vbroadcastsd 136(%rsi), %ymm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovaps 128(%rdx), %xmm3
-; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovaps 128(%rdx), %xmm4
+; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm4[4,5,6,7]
 ; AVX-NEXT:    vbroadcastsd 168(%rsi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovaps 160(%rdx), %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm3
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3]
+; AVX-NEXT:    vmovaps 160(%rdx), %xmm6
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm4
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 200(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovaps 192(%rdx), %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm6
-; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovaps 192(%rdx), %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm5
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm5[6,7]
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm0
-; AVX-NEXT:    vbroadcastsd 232(%rsi), %ymm6
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3]
-; AVX-NEXT:    vmovaps 224(%rdx), %xmm11
-; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm6 = mem[2,3,2,3]
+; AVX-NEXT:    vbroadcastsd 232(%rsi), %ymm5
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm5[2,3]
+; AVX-NEXT:    vmovaps 224(%rdx), %xmm5
+; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm11
+; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm7 = mem[2,3,2,3]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm5[0],mem[0]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovaps (%r8), %ymm15
-; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7]
-; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3]
+; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0,1,2,3,4,5],ymm15[6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm15[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
 ; AVX-NEXT:    vmovapd 48(%rdx), %xmm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3]
-; AVX-NEXT:    vmovapd 48(%rsi), %xmm15
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
 ; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm15
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3]
 ; AVX-NEXT:    vmovapd 32(%r8), %ymm15
-; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm15[0],mem[1,2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0],ymm15[1],mem[2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX-NEXT:    vmovaps 64(%r8), %ymm8
-; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
+; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm15[0],mem[1,2,3]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0],ymm15[1],mem[2,3]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm15[3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT:    vmovaps 64(%r8), %ymm3
+; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $252, (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm3[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3]
-; AVX-NEXT:    vmovapd 112(%rdx), %xmm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3]
-; AVX-NEXT:    vmovapd 112(%rsi), %xmm8
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm14[2,3]
+; AVX-NEXT:    vmovapd 112(%rdx), %xmm3
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
 ; AVX-NEXT:    vbroadcastsd 120(%rcx), %ymm8
-; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3]
 ; AVX-NEXT:    vmovapd 96(%r8), %ymm8
-; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0],ymm8[1],mem[2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm8[0],mem[1,2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm5 = ymm8[0],mem[1,2,3]
+; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm8[3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovaps 128(%r8), %ymm2
-; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm2[2,3],ymm13[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm9[2,3]
 ; AVX-NEXT:    vmovapd 176(%rdx), %xmm2
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
-; AVX-NEXT:    vmovapd 176(%rsi), %xmm5
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1]
-; AVX-NEXT:    vbroadcastsd 184(%rcx), %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3]
-; AVX-NEXT:    vmovapd 160(%r8), %ymm5
-; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm7 = mem[0],ymm5[1],mem[2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm7 = ymm5[0],mem[1,2,3]
-; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; AVX-NEXT:    vbroadcastsd 184(%rcx), %ymm3
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovapd 160(%r8), %ymm8
+; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = ymm8[0],mem[1,2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm8[3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX-NEXT:    vmovaps 192(%r8), %ymm2
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
 ; AVX-NEXT:    vmovapd 240(%rdx), %xmm1
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; AVX-NEXT:    vmovapd 240(%rsi), %xmm2
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
 ; AVX-NEXT:    vbroadcastsd 248(%rcx), %ymm2
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
-; AVX-NEXT:    vmovapd 224(%r8), %ymm5
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3]
-; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT:    vmovapd 224(%r8), %ymm15
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm15[0],ymm11[1,2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2,3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm15[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm0[0],mem[0]
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm5 = xmm4[0],mem[0]
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm2 = xmm3[0],mem[0]
-; AVX-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm5 = xmm0[0],mem[0]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm6 = xmm6[0],mem[0]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm4[0],mem[0]
+; AVX-NEXT:    vmovaps 128(%rdi), %xmm4
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm14
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovaps 64(%rdi), %xmm12
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm12
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm8
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm15
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm13
+; AVX-NEXT:    vmovaps 96(%rdi), %xmm13
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX-NEXT:    vmovaps 160(%rdi), %xmm10
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
 ; AVX-NEXT:    vmovaps 224(%rdi), %xmm9
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm11
+; AVX-NEXT:    vmovaps 192(%rdi), %xmm14
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX-NEXT:    vmovaps (%rdi), %xmm11
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX-NEXT:    vmovaps (%rdi), %xmm10
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX-NEXT:    vmovaps %xmm8, 16(%r9)
-; AVX-NEXT:    vmovaps %xmm10, (%r9)
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX-NEXT:    vmovaps %xmm7, 16(%r9)
+; AVX-NEXT:    vmovaps %xmm11, (%r9)
 ; AVX-NEXT:    vmovaps %xmm2, 976(%r9)
-; AVX-NEXT:    vmovaps %xmm11, 960(%r9)
+; AVX-NEXT:    vmovaps %xmm14, 960(%r9)
 ; AVX-NEXT:    vmovaps %xmm0, 1136(%r9)
 ; AVX-NEXT:    vmovaps %xmm9, 1120(%r9)
-; AVX-NEXT:    vmovaps %xmm5, 816(%r9)
-; AVX-NEXT:    vmovaps %xmm13, 800(%r9)
-; AVX-NEXT:    vmovaps %xmm7, 496(%r9)
-; AVX-NEXT:    vmovaps %xmm15, 480(%r9)
+; AVX-NEXT:    vmovaps %xmm6, 816(%r9)
+; AVX-NEXT:    vmovaps %xmm10, 800(%r9)
+; AVX-NEXT:    vmovaps %xmm5, 496(%r9)
+; AVX-NEXT:    vmovaps %xmm13, 480(%r9)
 ; AVX-NEXT:    vmovaps %xmm1, 176(%r9)
-; AVX-NEXT:    vmovaps %xmm12, 160(%r9)
+; AVX-NEXT:    vmovaps %xmm8, 160(%r9)
 ; AVX-NEXT:    vmovaps %xmm3, 336(%r9)
-; AVX-NEXT:    vmovaps %xmm14, 320(%r9)
-; AVX-NEXT:    vmovaps %xmm4, 656(%r9)
-; AVX-NEXT:    vmovaps %xmm6, 640(%r9)
+; AVX-NEXT:    vmovaps %xmm12, 320(%r9)
+; AVX-NEXT:    vmovaps %xmm15, 656(%r9)
+; AVX-NEXT:    vmovaps %xmm4, 640(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 1216(%r9)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7394,16 +7387,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-LABEL: store_i64_stride5_vf64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $2264, %rsp # imm = 0x8D8
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm4
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm11
 ; AVX-NEXT:    vmovaps 64(%rcx), %ymm1
 ; AVX-NEXT:    vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT:    vmovaps (%rcx), %ymm2
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
-; AVX-NEXT:    vmovaps 16(%rdx), %xmm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
+; AVX-NEXT:    vmovaps 16(%rdx), %xmm7
+; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
@@ -7412,10 +7405,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 96(%rcx), %xmm2
-; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7425,16 +7418,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX-NEXT:    vmovaps %ymm5, %ymm3
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 160(%rcx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX-NEXT:    vmovaps %ymm14, %ymm2
-; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 192(%rcx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7454,16 +7447,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 288(%rdi), %ymm14
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX-NEXT:    vmovapd 288(%rcx), %xmm1
-; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX-NEXT:    vmovaps 288(%rdi), %ymm10
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2]
 ; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 288(%rcx), %xmm1
+; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 320(%rdi), %ymm2
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 320(%rcx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7498,8 +7492,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 480(%rdi), %ymm13
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2]
+; AVX-NEXT:    vmovapd 480(%rdi), %ymm15
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],mem[0],ymm15[2],mem[2]
 ; AVX-NEXT:    vmovapd 480(%rcx), %xmm1
 ; AVX-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7517,32 +7511,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 32(%rdi), %ymm6
+; AVX-NEXT:    vmovapd 32(%rdi), %ymm7
 ; AVX-NEXT:    vbroadcastsd 40(%rsi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovaps 32(%rdx), %xmm1
-; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm6
 ; AVX-NEXT:    vbroadcastsd 72(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 64(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 104(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 96(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7561,14 +7555,14 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 168(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 160(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 200(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 192(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7599,14 +7593,14 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 296(%rsi), %ymm0
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 288(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 328(%rsi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 320(%rdx), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7626,76 +7620,74 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vbroadcastsd 392(%rsi), %ymm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT:    vmovaps 384(%rdx), %xmm15
-; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm1
+; AVX-NEXT:    vmovaps 384(%rdx), %xmm14
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm1
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovapd 416(%rdi), %ymm0
-; AVX-NEXT:    vbroadcastsd 424(%rsi), %ymm1
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT:    vmovapd 416(%rdi), %ymm1
+; AVX-NEXT:    vbroadcastsd 424(%rsi), %ymm0
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovaps 416(%rdx), %xmm9
 ; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3]
-; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 448(%rdi), %ymm1
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3]
+; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 448(%rdi), %ymm0
 ; AVX-NEXT:    vbroadcastsd 456(%rsi), %ymm9
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm9[4,5,6,7]
 ; AVX-NEXT:    vmovaps 448(%rdx), %xmm10
 ; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps {{.*#+}} xmm9 = mem[2,3,2,3]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm10 = mem[2,3,2,3]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
 ; AVX-NEXT:    vbroadcastsd 488(%rsi), %ymm10
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3]
-; AVX-NEXT:    vmovapd %ymm13, %ymm9
+; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm10[2,3]
+; AVX-NEXT:    vmovapd %ymm15, %ymm9
 ; AVX-NEXT:    vmovaps 480(%rdx), %xmm10
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm13
-; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3]
-; AVX-NEXT:    vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT:    vmovaps (%r8), %ymm12
-; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm12[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm15
+; AVX-NEXT:    vblendpd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3]
+; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT:    vmovaps (%r8), %ymm13
+; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = mem[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm13[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = mem[0,1],ymm13[2,3],mem[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5],ymm8[6,7]
 ; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3]
 ; AVX-NEXT:    vmovapd 48(%rdx), %xmm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3]
-; AVX-NEXT:    vmovapd 48(%rsi), %xmm12
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1]
-; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm12
-; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3]
-; AVX-NEXT:    vmovapd 32(%r8), %ymm12
-; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = ymm12[0],mem[1,2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm13 = mem[0],ymm12[1],mem[2,3]
-; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3]
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3]
-; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; AVX-NEXT:    vbroadcastsd 56(%rcx), %ymm13
+; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3]
+; AVX-NEXT:    vmovapd 32(%r8), %ymm13
+; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm13[0],mem[1,2,3]
+; AVX-NEXT:    vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = mem[0],ymm13[1],mem[2,3]
+; AVX-NEXT:    vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3]
+; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm8[0,1,2],ymm13[3]
+; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vmovaps 64(%r8), %ymm7
 ; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7]
@@ -7712,17 +7704,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3]
 ; AVX-NEXT:    vmovapd 112(%rdx), %xmm7
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3]
-; AVX-NEXT:    vmovapd 112(%rsi), %xmm8
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
 ; AVX-NEXT:    vbroadcastsd 120(%rcx), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3]
 ; AVX-NEXT:    vmovapd 96(%r8), %ymm8
-; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = mem[0],ymm8[1],mem[2,3]
-; AVX-NEXT:    vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm12 = ymm8[0],mem[1,2,3]
-; AVX-NEXT:    vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm13 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm13 = ymm8[0],mem[1,2,3]
+; AVX-NEXT:    vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3]
@@ -7746,8 +7737,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3]
 ; AVX-NEXT:    vmovapd 176(%rdx), %xmm6
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3]
-; AVX-NEXT:    vmovapd 176(%rsi), %xmm7
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
 ; AVX-NEXT:    vbroadcastsd 184(%rcx), %ymm7
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovapd 160(%r8), %ymm7
@@ -7780,8 +7770,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
 ; AVX-NEXT:    vmovapd 240(%rdx), %xmm5
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3]
-; AVX-NEXT:    vmovapd 240(%rsi), %xmm6
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
 ; AVX-NEXT:    vbroadcastsd 248(%rcx), %ymm6
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovapd 224(%r8), %ymm6
@@ -7810,11 +7799,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3]
+; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
 ; AVX-NEXT:    vmovapd 304(%rdx), %xmm4
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3]
-; AVX-NEXT:    vmovapd 304(%rsi), %xmm5
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
 ; AVX-NEXT:    vbroadcastsd 312(%rcx), %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovapd 288(%r8), %ymm5
@@ -7847,8 +7836,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3]
 ; AVX-NEXT:    vmovapd 368(%rdx), %xmm3
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3]
-; AVX-NEXT:    vmovapd 368(%rsi), %xmm4
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
 ; AVX-NEXT:    vbroadcastsd 376(%rcx), %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovapd 352(%r8), %ymm4
@@ -7878,11 +7866,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3]
 ; AVX-NEXT:    vmovapd 432(%rdx), %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3]
-; AVX-NEXT:    vmovapd 432(%rsi), %xmm3
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
 ; AVX-NEXT:    vbroadcastsd 440(%rcx), %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovapd 416(%r8), %ymm3
@@ -7892,19 +7879,18 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm4 = mem[0],ymm3[1],mem[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3]
-; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3]
+; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovaps 448(%r8), %ymm1
 ; AVX-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm12[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7913,8 +7899,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3]
 ; AVX-NEXT:    vmovapd 496(%rdx), %xmm1
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; AVX-NEXT:    vmovapd 496(%rsi), %xmm2
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
 ; AVX-NEXT:    vbroadcastsd 504(%rcx), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovapd 480(%r8), %ymm2
@@ -7929,9 +7914,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
@@ -7951,8 +7936,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm10[0],mem[0]
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm15[0],mem[0]
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm14[0],mem[0]
 ; AVX-NEXT:    vmovaps 256(%rdi), %xmm0
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7974,9 +7959,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 96(%rdi), %xmm0
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8041,11 +8026,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps %xmm0, 816(%r9)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 800(%r9)
-; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 496(%r9)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 480(%r9)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 176(%r9)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 160(%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index d25f8cf6b0bca..e2df674111de9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1139,7 +1139,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255]
 ; AVX-NEXT:    vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
 ; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
@@ -4839,162 +4839,161 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm28
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[u,7],zero,xmm0[9],zero,xmm0[u],zero,xmm0[u,10],zero,xmm0[12],zero,xmm0[u,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm0[8,u],zero,xmm0[7],zero,xmm0[9,u,11,u],zero,xmm0[10],zero,xmm0[12,u],zero
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512-FCP-NEXT:    vporq %xmm2, %xmm3, %xmm22
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm7
-; AVX512-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm22
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm30
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm31
-; AVX512-FCP-NEXT:    vporq %xmm6, %xmm7, %xmm23
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
-; AVX512-FCP-NEXT:    vporq %ymm7, %ymm4, %ymm24
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm9
-; AVX512-FCP-NEXT:    vporq %ymm4, %ymm9, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm13
-; AVX512-FCP-NEXT:    vporq %ymm4, %ymm13, %ymm26
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm7
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm13
-; AVX512-FCP-NEXT:    vporq %ymm7, %ymm13, %ymm27
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm7
+; AVX512-FCP-NEXT:    vporq %ymm4, %ymm7, %ymm23
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm14[6],zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9],zero,xmm14[11,u],zero,xmm14[10],zero,xmm14[12]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512-FCP-NEXT:    vporq %xmm4, %xmm7, %xmm24
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vporq %ymm3, %ymm5, %ymm25
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm7
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm10
+; AVX512-FCP-NEXT:    vporq %ymm7, %ymm10, %ymm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm7, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
+; AVX512-FCP-NEXT:    vporq %ymm10, %ymm9, %ymm27
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero,ymm12[25],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm7[21,u],zero,ymm7[20],zero,ymm7[22],zero,ymm7[24,u],zero,ymm7[23],zero,ymm7[25,u]
+; AVX512-FCP-NEXT:    vporq %ymm9, %ymm10, %ymm28
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
+; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm15
 ; AVX512-FCP-NEXT:    vporq %ymm0, %ymm15, %ymm16
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm8
-; AVX512-FCP-NEXT:    vporq %ymm3, %ymm8, %ymm19
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm2
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm17
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm5
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm5, %ymm19
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[5],zero,zero,ymm1[4],zero,ymm1[6],zero,ymm1[8],zero,zero,ymm1[7],zero,ymm1[9],zero,zero,zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm11[3],zero,ymm11[5,u],zero,ymm11[4],zero,ymm11[6],zero,ymm11[8,u],zero,ymm11[7],zero,ymm11[9,u,19],zero,ymm11[21,u],zero,ymm11[20],zero,ymm11[22],zero,ymm11[24,u],zero,ymm11[23],zero,ymm11[25,u]
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm18
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm11, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-FCP-NEXT:    vpandn %ymm11, %ymm12, %ymm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm9
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm13, %ymm11
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpermd %zmm7, %zmm13, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm13
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm15
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[12],zero,zero,zero,zero,ymm8[13],zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,ymm8[18],zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,2,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm11, %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm12
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [6,6,6,0,7,7,7,7]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,1,1,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm5, %zmm3
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
 ; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm21[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm24[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm5, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm26, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm3, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [9,9,10,10,10,10,10,10]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [11,11,11,11,0,12,12,12]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm26[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm25, %zmm5
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm28[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm6 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm7 & mem)
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm5, %zmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm5, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm17))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,14,13,13,13,13,12,14]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [14,14,14,14,15,15,15,15]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm18))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm5 & (zmm0 ^ zmm20))
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,8,8,8,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 256(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 192(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 128(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -5175,162 +5174,161 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm29
-; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[u,7],zero,xmm0[9],zero,xmm0[u],zero,xmm0[u,10],zero,xmm0[12],zero,xmm0[u,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm0[8,u],zero,xmm0[7],zero,xmm0[9,u,11,u],zero,xmm0[10],zero,xmm0[12,u],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512DQ-FCP-NEXT:    vporq %xmm2, %xmm3, %xmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm7
-; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm22
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT:    vporq %xmm6, %xmm7, %xmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT:    vporq %ymm7, %ymm4, %ymm24
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm9
-; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm9, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT:    vporq %ymm7, %ymm13, %ymm27
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm7, %ymm23
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm14[6],zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9],zero,xmm14[11,u],zero,xmm14[10],zero,xmm14[12]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm7, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT:    vporq %ymm3, %ymm5, %ymm25
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT:    vporq %ymm7, %ymm10, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm7, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm9, %ymm27
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero,ymm12[25],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm7[21,u],zero,ymm7[20],zero,ymm7[22],zero,ymm7[24,u],zero,ymm7[23],zero,ymm7[25,u]
+; AVX512DQ-FCP-NEXT:    vporq %ymm9, %ymm10, %ymm28
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm15
 ; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm15, %ymm16
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm8
-; AVX512DQ-FCP-NEXT:    vporq %ymm3, %ymm8, %ymm19
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm17
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm5, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[5],zero,zero,ymm1[4],zero,ymm1[6],zero,ymm1[8],zero,zero,ymm1[7],zero,ymm1[9],zero,zero,zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm11[3],zero,ymm11[5,u],zero,ymm11[4],zero,ymm11[6],zero,ymm11[8,u],zero,ymm11[7],zero,ymm11[9,u,19],zero,ymm11[21,u],zero,ymm11[20],zero,ymm11[22],zero,ymm11[24,u],zero,ymm11[23],zero,ymm11[25,u]
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm18
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm9, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm11, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm9
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm13, %ymm11
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm7, %zmm13, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm15
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[12],zero,zero,zero,zero,ymm8[13],zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,ymm8[18],zero
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,2,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm11, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm12
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [6,6,6,0,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,1,1,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm5, %zmm3
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
 ; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm21[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm24[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm5, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm26, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm3, %zmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm3, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [9,9,10,10,10,10,10,10]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [11,11,11,11,0,12,12,12]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm26[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm28[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm6 & (zmm7 ^ zmm5))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm7 & mem)
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm5, %zmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm5, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm17))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [12,14,13,13,13,13,12,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [14,14,14,14,15,15,15,15]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm18))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm5 & (zmm0 ^ zmm20))
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [8,8,8,8,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 256(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 128(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -5423,7 +5421,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
 ; AVX512BW-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %ymm23, %ymm28, %ymm29
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %ymm24, %ymm27, %ymm30
 ; AVX512BW-NEXT:    vporq %ymm29, %ymm30, %ymm29
 ; AVX512BW-NEXT:    vpshufb %ymm18, %ymm28, %ymm18
@@ -5432,7 +5431,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovdqu8 %ymm27, %ymm18 {%k5}
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm29, %zmm18
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %ymm27, %ymm25, %ymm28
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
 ; AVX512BW-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
@@ -5504,12 +5504,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm21
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm21, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm2
 ; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
@@ -5517,77 +5517,82 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm5
 ; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm5, %xmm10
 ; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm18
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm9[0,0,1,1]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm20
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm22
+; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm9, %ymm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
 ; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
-; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [9,9,10,10,10,10,10,10]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm19
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [11,11,11,11,0,12,12,12]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k3}
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm18, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm19, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm20[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm20, %zmm23, %zmm23
 ; AVX512BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm23, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm24, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT:    vporq %zmm21, %zmm8, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
 ; AVX512BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm21
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
-; AVX512BW-FCP-NEXT:    vpermd %zmm21, %zmm22, %zmm21
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,14,13,13,13,13,12,14]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [14,14,14,14,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm21, %zmm21
 ; AVX512BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm12, %xmm16
-; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm17, %xmm16
-; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm16, %xmm11
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm15, %xmm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -5604,19 +5609,19 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm7
 ; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm13, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm17
-; AVX512BW-FCP-NEXT:    vporq %ymm7, %ymm17, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm16
+; AVX512BW-FCP-NEXT:    vporq %ymm7, %ymm16, %ymm7
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm16
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpshufb %ymm18, %ymm17, %ymm18
-; AVX512BW-FCP-NEXT:    vporq %ymm15, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm18
+; AVX512BW-FCP-NEXT:    vporq %ymm16, %ymm18, %ymm16
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
 ; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm9, %ymm9
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
 ; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm6, %zmm6
@@ -5631,14 +5636,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
 ; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
-; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [6,6,6,6,7,7,7,7]
+; AVX512BW-FCP-NEXT:    vpermd (%r8), %ymm2, %ymm2
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,8,8,8,8,8,9,9]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
@@ -5739,7 +5747,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
 ; AVX512DQ-BW-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpshufb %ymm23, %ymm28, %ymm29
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQ-BW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm27, %ymm30
 ; AVX512DQ-BW-NEXT:    vporq %ymm29, %ymm30, %ymm29
 ; AVX512DQ-BW-NEXT:    vpshufb %ymm18, %ymm28, %ymm18
@@ -5748,7 +5757,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm27, %ymm18 {%k5}
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm29, %zmm18
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-BW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpshufb %ymm27, %ymm25, %ymm28
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
 ; AVX512DQ-BW-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
@@ -5820,12 +5830,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm21
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm21, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
@@ -5833,77 +5843,82 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm5, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm9[0,0,1,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm20
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm9, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512DQ-BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [9,9,10,10,10,10,10,10]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [11,11,11,11,0,12,12,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm18, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm19, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm20[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm20, %zmm23, %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm23, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm24, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm21, %zmm8, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm21
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm21, %zmm22, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [12,14,13,13,13,13,12,14]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [14,14,14,14,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm21, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm12, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm17, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm16, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm15, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -5920,19 +5935,19 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm13, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm7, %ymm17, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm7, %ymm16, %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm18, %ymm17, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm15, %ymm18, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm17, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm16, %ymm18, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm9, %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
 ; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm6, %zmm6
@@ -5947,14 +5962,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [6,6,6,6,7,7,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermd (%r8), %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,8,8,8,8,8,9,9]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 6205be83f5123..a08bb851c603d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -2014,13 +2014,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm2
+; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT:    vandnps %ymm1, %ymm9, %ymm2
 ; AVX-NEXT:    vmovdqa 16(%rcx), %xmm1
 ; AVX-NEXT:    vmovdqa 16(%rdx), %xmm3
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
@@ -2030,47 +2030,47 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT:    vandps %ymm3, %ymm10, %ymm3
+; AVX-NEXT:    vandps %ymm3, %ymm9, %ymm3
 ; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT:    vmovdqa 16(%r8), %xmm12
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero
+; AVX-NEXT:    vmovdqa 16(%r8), %xmm11
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[8,u],zero,zero,zero,zero,xmm11[9,u],zero,zero,zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15]
 ; AVX-NEXT:    vmovdqa 16(%r9), %xmm13
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero
 ; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm11[5,u],zero,zero,zero,zero,xmm11[6,u],zero,zero,zero,zero,xmm11[7,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7]
 ; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
+; AVX-NEXT:    vandps %ymm0, %ymm9, %ymm0
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
+; AVX-NEXT:    vandnps %ymm1, %ymm9, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero
 ; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm11, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
+; AVX-NEXT:    vpshufb %xmm10, %xmm13, %xmm1
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rsi), %xmm8
@@ -2087,51 +2087,53 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
+; AVX-NEXT:    vandps %ymm0, %ymm9, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm9, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm15
 ; AVX-NEXT:    vmovdqa (%r8), %xmm1
 ; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
 ; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm2
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
 ; AVX-NEXT:    vmovdqa (%r9), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15]
+; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm9
 ; AVX-NEXT:    vpor %xmm2, %xmm9, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero
 ; AVX-NEXT:    vpor %xmm2, %xmm9, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[0,0,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm2, %ymm9
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm2, %ymm11
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm2, %ymm10
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX-NEXT:    vandps %ymm2, %ymm9, %ymm9
-; AVX-NEXT:    vandnps %ymm11, %ymm2, %ymm11
-; AVX-NEXT:    vorps %ymm11, %ymm9, %ymm15
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm11
-; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm10
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
-; AVX-NEXT:    vpshufb %xmm15, %xmm12, %xmm12
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT:    vpshufb %xmm12, %xmm13, %xmm13
-; AVX-NEXT:    vpor %xmm13, %xmm10, %xmm10
+; AVX-NEXT:    vandnps %ymm10, %ymm2, %ymm10
+; AVX-NEXT:    vorps %ymm10, %ymm9, %ymm9
+; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm11[0,u],zero,zero,zero,zero,xmm11[1,u],zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT:    vpand %xmm14, %xmm12, %xmm12
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
+; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm10
+; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
+; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5],xmm11[6],xmm9[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
+; AVX-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT:    vpshufb %xmm10, %xmm13, %xmm13
+; AVX-NEXT:    vpor %xmm13, %xmm9, %xmm9
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2]
@@ -2145,16 +2147,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vandps %ymm2, %ymm8, %ymm8
 ; AVX-NEXT:    vandnps %ymm7, %ymm2, %ymm2
 ; AVX-NEXT:    vorps %ymm2, %ymm8, %ymm7
-; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm2
+; AVX-NEXT:    vpshufb %xmm12, %xmm1, %xmm2
 ; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm8
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm12, %xmm0, %xmm8
+; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm8
 ; AVX-NEXT:    vpor %xmm2, %xmm8, %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
-; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero
+; AVX-NEXT:    vpand %xmm7, %xmm14, %xmm7
+; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm8
 ; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
@@ -2170,13 +2172,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7]
 ; AVX-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10],zero,xmm1[12,13,14,15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -2184,8 +2186,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovdqa %xmm4, 32(%rax)
 ; AVX-NEXT:    vmovdqa %xmm7, (%rax)
 ; AVX-NEXT:    vmovdqa %xmm2, 16(%rax)
-; AVX-NEXT:    vmovdqa %xmm10, 112(%rax)
-; AVX-NEXT:    vmovdqa %xmm11, 96(%rax)
+; AVX-NEXT:    vmovdqa %xmm9, 112(%rax)
+; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps %xmm0, 96(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 64(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -3956,7 +3959,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX-LABEL: store_i8_stride6_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $200, %rsp
+; AVX-NEXT:    subq $184, %rsp
 ; AVX-NEXT:    vmovdqa 48(%rsi), %xmm1
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -3971,174 +3974,172 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3]
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
-; AVX-NEXT:    vorps %ymm2, %ymm5, %ymm6
-; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm5
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[1,0,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
+; AVX-NEXT:    vandps %ymm4, %ymm10, %ymm4
+; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm6
+; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm4
 ; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15]
-; AVX-NEXT:    vmovdqa 48(%r9), %xmm5
-; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15]
+; AVX-NEXT:    vmovdqa 48(%r9), %xmm4
+; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm4[8],zero,zero,zero,zero,zero,xmm4[9],zero,zero,zero,zero
 ; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5],zero,zero,zero,zero,zero,xmm4[6],zero,zero,zero,zero,zero,xmm4[7]
 ; AVX-NEXT:    vpor %xmm7, %xmm6, %xmm6
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
 ; AVX-NEXT:    vandps %ymm1, %ymm10, %ymm1
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX-NEXT:    vandnps %ymm4, %ymm10, %ymm4
-; AVX-NEXT:    vorps %ymm4, %ymm1, %ymm1
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm6
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15]
-; AVX-NEXT:    vpor %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128]
-; AVX-NEXT:    vpshufb %xmm13, %xmm5, %xmm4
-; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX-NEXT:    vandnps %ymm5, %ymm10, %ymm5
+; AVX-NEXT:    vorps %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
+; AVX-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm4[13],zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,xmm4[15]
+; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128]
+; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm5
+; AVX-NEXT:    vpor %xmm5, %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 32(%rsi), %xmm6
 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
 ; AVX-NEXT:    vmovdqa 32(%rcx), %xmm12
-; AVX-NEXT:    vmovdqa 32(%rdx), %xmm14
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX-NEXT:    vandps %ymm1, %ymm10, %ymm1
-; AVX-NEXT:    vandnps %ymm8, %ymm10, %ymm8
-; AVX-NEXT:    vorps %ymm1, %ymm8, %ymm9
-; AVX-NEXT:    vmovdqa 32(%r8), %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm8
-; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm10
-; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero
-; AVX-NEXT:    vmovdqa 32(%r9), %xmm8
-; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15]
+; AVX-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
+; AVX-NEXT:    vandnps %ymm7, %ymm10, %ymm7
+; AVX-NEXT:    vorps %ymm7, %ymm5, %ymm8
+; AVX-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX-NEXT:    vextractf128 $1, %ymm8, %xmm7
+; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm5[13,u],zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,xmm5[15,u]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm7, %xmm10
+; AVX-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[13],zero,zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,zero,xmm7[15]
 ; AVX-NEXT:    vpor %xmm15, %xmm10, %xmm10
 ; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
-; AVX-NEXT:    vpshufb %xmm13, %xmm8, %xmm10
-; AVX-NEXT:    vpor %xmm10, %xmm9, %xmm9
-; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[10,u],zero,zero,zero,zero,xmm5[11,u],zero,zero,zero,zero,xmm5[12,u],zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8
+; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm10
+; AVX-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[1,0,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm9, %ymm3
-; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX-NEXT:    vandps %ymm0, %ymm9, %ymm0
-; AVX-NEXT:    vandnps %ymm3, %ymm9, %ymm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm8, %ymm3
+; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX-NEXT:    vandps %ymm0, %ymm8, %ymm0
+; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
 ; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7]
-; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT:    vpshufb %xmm13, %xmm5, %xmm9
-; AVX-NEXT:    vpor %xmm3, %xmm9, %xmm3
+; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2],xmm8[3],xmm3[4,5],xmm8[6],xmm3[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm8
+; AVX-NEXT:    vpor %xmm3, %xmm8, %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
-; AVX-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
+; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
-; AVX-NEXT:    vpshufb %xmm15, %xmm5, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT:    vpand %xmm0, %xmm10, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
+; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm2
 ; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX-NEXT:    vandps %ymm6, %ymm2, %ymm2
-; AVX-NEXT:    vandnps %ymm5, %ymm6, %ymm5
-; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm2
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15]
-; AVX-NEXT:    vpshufb %xmm13, %xmm8, %xmm6
-; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm10, %xmm1, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm5
-; AVX-NEXT:    vpor %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vandnps %ymm4, %ymm6, %ymm4
+; AVX-NEXT:    vorps %ymm4, %ymm2, %ymm2
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[2,u],zero,zero,zero,zero,xmm5[3,u],zero,zero,zero,zero,xmm5[4,u],zero,zero
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2,3,4,5,6],zero,xmm4[8,9,10,11,12],zero,xmm4[14,15]
+; AVX-NEXT:    vpshufb %xmm15, %xmm7, %xmm6
+; AVX-NEXT:    vpor %xmm6, %xmm4, %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb %xmm14, %xmm5, %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX-NEXT:    vpand %xmm2, %xmm10, %xmm2
+; AVX-NEXT:    vpshufb %xmm8, %xmm7, %xmm4
+; AVX-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT:    vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vandps %ymm2, %ymm15, %ymm2
-; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128]
-; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa %xmm9, %xmm14
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128]
-; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
-; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u]
-; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128]
+; AVX-NEXT:    vpshufb %xmm6, %xmm5, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128]
+; AVX-NEXT:    vpshufb %xmm9, %xmm7, %xmm2
+; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u]
+; AVX-NEXT:    vpshufb %xmm12, %xmm5, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm8, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX-NEXT:    vpand %xmm0, %xmm13, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7]
+; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm1
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -4155,22 +4156,23 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT:    vandnps %ymm1, %ymm15, %ymm1
-; AVX-NEXT:    vandps %ymm2, %ymm15, %ymm2
+; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
+; AVX-NEXT:    vandps %ymm7, %ymm2, %ymm2
 ; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmovdqa 16(%r8), %xmm3
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm5
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm5
+; AVX-NEXT:    vpshufb %xmm8, %xmm2, %xmm5
 ; AVX-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
+; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm6
 ; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm5
+; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm5
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7]
-; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm5
+; AVX-NEXT:    vpand %xmm1, %xmm13, %xmm1
+; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm5
 ; AVX-NEXT:    vpor %xmm5, %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
@@ -4181,58 +4183,55 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX-NEXT:    vandps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm15, %ymm1
+; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm4
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
+; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm4
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7]
-; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
-; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm4
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
+; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
+; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
 ; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128]
-; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[10,u],zero,zero,zero,zero,xmm3[11,u],zero,zero,zero,zero,xmm3[12,u],zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm9[3,3,3,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm12
 ; AVX-NEXT:    vmovdqa (%r8), %xmm1
-; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm14
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
+; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm0
+; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6],xmm0[7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm13
 ; AVX-NEXT:    vmovdqa (%r9), %xmm0
-; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm15
-; AVX-NEXT:    vpor %xmm15, %xmm14, %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm15
+; AVX-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7]
-; AVX-NEXT:    vmovdqa %xmm13, %xmm4
-; AVX-NEXT:    vpshufb %xmm13, %xmm12, %xmm12
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12, %xmm12
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero
 ; AVX-NEXT:    vpor %xmm15, %xmm12, %xmm12
 ; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4244,24 +4243,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm15, %ymm10
-; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX-NEXT:    vandps %ymm13, %ymm11, %ymm11
-; AVX-NEXT:    vandnps %ymm10, %ymm13, %ymm10
+; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX-NEXT:    vandps %ymm14, %ymm11, %ymm11
+; AVX-NEXT:    vandnps %ymm10, %ymm14, %ymm10
 ; AVX-NEXT:    vorps %ymm10, %ymm11, %ymm11
 ; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm10
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
 ; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm15
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7]
-; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm15
-; AVX-NEXT:    vpor %xmm15, %xmm10, %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0],zero,xmm10[2,3,4,5,6],zero,xmm10[8,9,10,11,12],zero,xmm10[14,15]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT:    vpshufb %xmm13, %xmm2, %xmm15
+; AVX-NEXT:    vpor %xmm15, %xmm10, %xmm10
+; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
 ; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT:    vpand %xmm3, %xmm11, %xmm3
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
 ; AVX-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
@@ -4269,33 +4268,32 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm3, %ymm3
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm6, %ymm6
-; AVX-NEXT:    vandps %ymm3, %ymm13, %ymm3
-; AVX-NEXT:    vandnps %ymm6, %ymm13, %ymm6
+; AVX-NEXT:    vandps %ymm3, %ymm14, %ymm3
+; AVX-NEXT:    vandnps %ymm6, %ymm14, %ymm6
 ; AVX-NEXT:    vorps %ymm6, %ymm3, %ymm6
 ; AVX-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
 ; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm8
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15]
-; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm8
+; AVX-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
 ; AVX-NEXT:    vpor %xmm3, %xmm8, %xmm3
 ; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm8
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7]
-; AVX-NEXT:    vpshufb %xmm11, %xmm6, %xmm6
+; AVX-NEXT:    vpand %xmm6, %xmm11, %xmm6
 ; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm8
 ; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm6
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX-NEXT:    vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[0,0,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[0,0,1,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
@@ -4305,12 +4303,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
 ; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm7
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
+; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
 ; AVX-NEXT:    vpor %xmm7, %xmm5, %xmm5
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7]
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -4333,9 +4331,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovaps %xmm0, 128(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 144(%rax)
-; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm0, 224(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovaps %xmm0, 224(%rax)
+; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 240(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 192(%rax)
@@ -4357,7 +4355,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovaps %xmm0, 320(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 336(%rax)
-; AVX-NEXT:    addq $200, %rsp
+; AVX-NEXT:    addq $184, %rsp
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 02ec9fc66feab..cf35f47866f29 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2089,7 +2089,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
 ; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
@@ -2156,7 +2156,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
 ; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
@@ -2226,7 +2226,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
 ; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
@@ -2293,7 +2293,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
@@ -3331,266 +3331,263 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX-LABEL: store_i8_stride7_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $216, %rsp
+; AVX-NEXT:    subq $184, %rsp
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovdqa 16(%rax), %xmm14
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u]
-; AVX-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 16(%r9), %xmm10
 ; AVX-NEXT:    vmovdqa 16(%r8), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u]
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
-; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
-; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
-; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
-; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
-; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm3
-; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
+; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
+; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm2
+; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
+; AVX-NEXT:    vpshufb %xmm11, %xmm14, %xmm2
+; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm7
-; AVX-NEXT:    vmovdqa 16(%rcx), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdx), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm3
-; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm15
-; AVX-NEXT:    vmovdqa 16(%rsi), %xmm10
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
-; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
+; AVX-NEXT:    vmovdqa 16(%rcx), %xmm13
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm2
+; AVX-NEXT:    vmovdqa 16(%rdx), %xmm12
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,7],zero,xmm12[u,u,u,u,u,8],zero,xmm12[u,u,u,u]
+; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm8
+; AVX-NEXT:    vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm6
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
-; AVX-NEXT:    vandnps %ymm15, %ymm2, %ymm15
-; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
-; AVX-NEXT:    vandnps %ymm7, %ymm2, %ymm7
-; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
+; AVX-NEXT:    vandnps %ymm8, %ymm1, %ymm8
+; AVX-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vorps %ymm0, %ymm8, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
+; AVX-NEXT:    vandnps %ymm7, %ymm1, %ymm7
+; AVX-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vorps %ymm7, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa (%r9), %xmm7
-; AVX-NEXT:    vpshufb %xmm11, %xmm7, %xmm0
-; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa (%r9), %xmm2
+; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
+; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%r8), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa (%rax), %xmm8
-; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm2
-; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
-; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm2
-; AVX-NEXT:    vmovdqa %xmm5, %xmm8
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13]
-; AVX-NEXT:    vpor %xmm2, %xmm5, %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm15
-; AVX-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm0
-; AVX-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm2
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm1
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u]
+; AVX-NEXT:    vmovdqa (%rax), %xmm4
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
+; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm5
+; AVX-NEXT:    vpor %xmm5, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm11
+; AVX-NEXT:    vmovdqa (%rcx), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
+; AVX-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
-; AVX-NEXT:    vpor %xmm2, %xmm11, %xmm2
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm12, %xmm9, %xmm11
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm11, %ymm11
+; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
+; AVX-NEXT:    vpor %xmm0, %xmm8, %xmm0
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm15
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm15, %ymm15
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vandps %ymm4, %ymm11, %ymm11
-; AVX-NEXT:    vorps %ymm0, %ymm11, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX-NEXT:    vandnps %ymm15, %ymm11, %ymm15
-; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
-; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT:    vandps %ymm4, %ymm15, %ymm15
+; AVX-NEXT:    vorps %ymm1, %ymm15, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX-NEXT:    vandnps %ymm11, %ymm15, %ymm11
+; AVX-NEXT:    vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT:    vorps %ymm1, %ymm11, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm8, %xmm14, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm7, %xmm14, %xmm1
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
-; AVX-NEXT:    vpor %xmm0, %xmm11, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
-; AVX-NEXT:    vpor %xmm11, %xmm15, %xmm11
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm11, %ymm15
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
-; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm12
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[8,9],zero,xmm0[u,u,u,u,10,11],zero,xmm0[u,u,u,u,12,13]
+; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11, %xmm11
+; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
+; AVX-NEXT:    vpor %xmm15, %xmm11, %xmm11
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm2
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
+; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm15, %ymm15
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
-; AVX-NEXT:    vandnps %ymm12, %ymm4, %ymm12
+; AVX-NEXT:    vandnps %ymm15, %ymm4, %ymm14
 ; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vorps %ymm0, %ymm14, %ymm0
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
-; AVX-NEXT:    vandnps %ymm15, %ymm4, %ymm12
+; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm1
 ; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
-; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm4
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
-; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm6
-; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vandps %ymm4, %ymm6, %ymm6
-; AVX-NEXT:    vorps %ymm0, %ymm6, %ymm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm12, %ymm6
-; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
-; AVX-NEXT:    vandnps %ymm6, %ymm12, %ymm6
-; AVX-NEXT:    vorps %ymm6, %ymm0, %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero
+; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm4, %ymm0, %ymm4
+; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm12
+; AVX-NEXT:    vorps %ymm4, %ymm12, %ymm4
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX-NEXT:    vandps %ymm4, %ymm13, %ymm4
+; AVX-NEXT:    vandnps %ymm12, %ymm13, %ymm12
+; AVX-NEXT:    vorps %ymm4, %ymm12, %ymm4
+; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm12
+; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0],zero,xmm12[2,3,4,5,6,7],zero,xmm12[9,10,11,12,13,14],zero
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
-; AVX-NEXT:    vpor %xmm6, %xmm12, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm6, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
-; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
+; AVX-NEXT:    vpor %xmm13, %xmm12, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8,9],zero,xmm4[11,12,13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm4, %xmm12, %xmm4
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT:    vpshufb %xmm6, %xmm13, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
-; AVX-NEXT:    vandnps %ymm1, %ymm5, %ymm1
-; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
-; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
+; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
+; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u]
+; AVX-NEXT:    vmovdqa (%rsp), %xmm12 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
 ; AVX-NEXT:    vpor %xmm2, %xmm6, %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero
 ; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
 ; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
 ; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm6
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm1
 ; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm2 = [16777216,197120]
-; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
+; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm7
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
-; AVX-NEXT:    vandps %ymm4, %ymm5, %ymm4
-; AVX-NEXT:    vorps %ymm1, %ymm4, %ymm4
+; AVX-NEXT:    vandnps %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vandps %ymm0, %ymm5, %ymm0
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
-; AVX-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15]
-; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
-; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7
+; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm10[13,u,u,u,u],zero,zero,xmm10[14,u,u,u,u],zero,zero,xmm10[15]
+; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
+; AVX-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX-NEXT:    vandps %ymm7, %ymm4, %ymm4
+; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
 ; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm5
+; AVX-NEXT:    vorps %ymm5, %ymm0, %ymm5
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
-; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm2
-; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
-; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
+; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
+; AVX-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u]
-; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u]
+; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3]
-; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3]
+; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps %ymm1, (%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 128(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm0, (%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX-NEXT:    vmovaps %ymm5, 96(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 160(%rax)
 ; AVX-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX-NEXT:    vmovdqa %xmm0, 192(%rax)
+; AVX-NEXT:    vmovdqa %xmm4, 192(%rax)
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovaps %xmm0, 208(%rax)
-; AVX-NEXT:    addq $216, %rsp
+; AVX-NEXT:    addq $184, %rsp
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -6622,7 +6619,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
 ; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmovdqa 16(%rsi), %xmm7
@@ -6656,19 +6654,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm3, %xmm8
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
-; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
-; AVX-NEXT:    vmovdqa %xmm4, %xmm10
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
+; AVX-NEXT:    vpshufb %xmm5, %xmm10, %xmm3
 ; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
-; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
 ; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
+; AVX-NEXT:    vmovdqa %xmm4, %xmm6
 ; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX-NEXT:    vmovdqa %xmm3, %xmm12
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm1
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
 ; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm3
 ; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
@@ -6694,32 +6690,34 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX-NEXT:    vmovdqa 32(%r9), %xmm11
-; AVX-NEXT:    vpshufb %xmm8, %xmm11, %xmm0
-; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
+; AVX-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm0
+; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u]
 ; AVX-NEXT:    vmovdqa 32(%rax), %xmm8
-; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm1
+; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm1
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; AVX-NEXT:    vmovdqa %xmm4, %xmm12
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm3, %xmm10
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
-; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero
+; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm4
 ; AVX-NEXT:    vmovdqa 32(%rcx), %xmm0
 ; AVX-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
+; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm1
 ; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm3
 ; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm5
 ; AVX-NEXT:    vmovdqa 32(%rsi), %xmm1
 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
@@ -6728,12 +6726,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT:    vandps %ymm7, %ymm6, %ymm6
+; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm5, %ymm13, %ymm5
+; AVX-NEXT:    vandps %ymm6, %ymm13, %ymm6
 ; AVX-NEXT:    vorps %ymm5, %ymm6, %ymm5
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
 ; AVX-NEXT:    vandnps %ymm4, %ymm6, %ymm4
@@ -6742,10 +6740,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
 ; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
 ; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
@@ -6771,41 +6770,40 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rax), %xmm12
+; AVX-NEXT:    vmovdqa 48(%rax), %xmm14
 ; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 48(%r9), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
-; AVX-NEXT:    vpshufb %xmm5, %xmm12, %xmm0
+; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm0
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13]
-; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u]
-; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
+; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovdqa 48(%rsi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm10
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vmovdqa 48(%rcx), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rdx), %xmm13
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; AVX-NEXT:    vmovdqa 48(%rcx), %xmm10
+; AVX-NEXT:    vmovdqa 48(%rdx), %xmm12
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
 ; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
 ; AVX-NEXT:    vpshufb %xmm9, %xmm4, %xmm3
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm11, %ymm3
-; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
-; AVX-NEXT:    vandps %ymm7, %ymm3, %ymm3
+; AVX-NEXT:    vandnps %ymm1, %ymm13, %ymm1
+; AVX-NEXT:    vandps %ymm3, %ymm13, %ymm3
 ; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
@@ -6821,35 +6819,37 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vmovdqa (%rax), %xmm7
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u]
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm5, %xmm7, %xmm1
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm15
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm5, %xmm7, %xmm15
 ; AVX-NEXT:    vpor %xmm1, %xmm15, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm15
 ; AVX-NEXT:    vmovdqa (%rcx), %xmm5
 ; AVX-NEXT:    vmovdqa (%rdx), %xmm3
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
-; AVX-NEXT:    vpor %xmm0, %xmm14, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
+; AVX-NEXT:    vpor %xmm0, %xmm13, %xmm0
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm4
-; AVX-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm13, %ymm4
+; AVX-NEXT:    vmovdqa (%rsi), %xmm13
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[u],zero,xmm13[7,u,u,u,u,u],zero,xmm13[8,u,u,u,u,u],zero
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9]
 ; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm11
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm4, %ymm9, %ymm4
-; AVX-NEXT:    vandps %ymm1, %ymm9, %ymm1
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm4, %ymm0, %ymm4
+; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm4, %ymm1, %ymm1
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
 ; AVX-NEXT:    vandnps %ymm15, %ymm0, %ymm4
@@ -6861,25 +6861,26 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10]
 ; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u]
 ; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
-; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
+; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
-; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
+; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
+; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
 ; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
@@ -6891,239 +6892,242 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
-; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm11
+; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm15[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm4
-; AVX-NEXT:    vandnps %ymm3, %ymm9, %ymm3
-; AVX-NEXT:    vandps %ymm4, %ymm9, %ymm4
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm3, %ymm0, %ymm3
+; AVX-NEXT:    vandps %ymm0, %ymm4, %ymm4
 ; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
-; AVX-NEXT:    vandnps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm3
-; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
+; AVX-NEXT:    vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT:    vandps %ymm0, %ymm3, %ymm3
+; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa (%rsp), %xmm9 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u]
-; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm4
+; AVX-NEXT:    vmovdqa %xmm10, %xmm6
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,xmm10[7,u,u,u,u,u],zero,xmm10[8,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,7],zero,xmm12[u,u,u,u,u,8],zero,xmm12[u,u,u,u]
+; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX-NEXT:    vpshufb %xmm13, %xmm10, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero
+; AVX-NEXT:    vmovdqa (%rsp), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm11[u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u,u,9]
+; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9]
-; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
-; AVX-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm4, %ymm4
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
-; AVX-NEXT:    vandnps %ymm3, %ymm1, %ymm3
-; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm4
-; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero
-; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero
-; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero
-; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
+; AVX-NEXT:    vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT:    vandps %ymm0, %ymm3, %ymm3
+; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[u,u,6,7,8,9],zero,xmm3[u,u,13,14,15]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[9,u,u],zero,zero,zero,zero,xmm1[10,u,u],zero,zero,zero
 ; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3],zero,xmm3[u,6,7,8,9,10],zero,xmm3[u,13,14,15]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero
 ; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero
-; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm4
-; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm10 = [218890240,986624]
-; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
-; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
-; AVX-NEXT:    vandps %ymm0, %ymm8, %ymm0
-; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10,11],zero,xmm3[13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero,zero,zero,xmm14[10],zero,zero,zero
+; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,1,2,3,4],zero,xmm2[u,u,8,9,10,11],zero,xmm2[u,u,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u],zero,zero,zero,zero,xmm1[7,u,u],zero,zero,zero,zero,xmm1[8,u,u],zero
+; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,1,2,3,4,5],zero,xmm2[u,8,9,10,11,12],zero,xmm2[u,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero
+; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[6],zero,zero,zero,zero,zero,zero,xmm14[7],zero,zero,zero,zero,zero,zero,xmm14[8],zero
+; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm0 = [218890240,986624]
+; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
+; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT:    vpshufb %xmm7, %xmm10, %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
+; AVX-NEXT:    vandps %ymm6, %ymm3, %ymm3
+; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0],zero,xmm2[u,u,4,5,6,7],zero,xmm2[u,u,11,12,13,14],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm1[11,u,u],zero,zero,zero,zero,xmm1[12,u,u],zero,zero,zero,zero,xmm1[13]
 ; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero
-; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm2[u,4,5,6,7,0],zero,xmm2[u,11,12,13,14,1],zero,xmm2[u]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u]
-; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15]
-; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm15 = [16777216,197120]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6,7],zero,xmm2[9,10,11,12,13,14],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
+; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm10 = [16777216,197120]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT:    vpshufb %xmm10, %xmm1, %xmm2
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm4
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm10, %xmm1, %xmm7
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm7
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
-; AVX-NEXT:    vandnps %ymm0, %ymm8, %ymm0
-; AVX-NEXT:    vandps %ymm4, %ymm8, %ymm4
-; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15]
-; AVX-NEXT:    vpor %xmm7, %xmm4, %xmm7
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm10
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
-; AVX-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
-; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
-; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT:    vorps %ymm5, %ymm0, %ymm0
+; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
+; AVX-NEXT:    vandps %ymm6, %ymm4, %ymm4
+; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm2
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9, %xmm9
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,xmm7[13,u,u,u,u],zero,zero,xmm7[14,u,u,u,u],zero,zero,xmm7[15]
+; AVX-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
+; AVX-NEXT:    vpshufb %xmm13, %xmm5, %xmm5
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm5
+; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX-NEXT:    vandps %ymm2, %ymm12, %ymm2
+; AVX-NEXT:    vandnps %ymm5, %ymm12, %ymm5
+; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
-; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm10
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm10, %ymm5
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
+; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm9
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm5
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm10
-; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
-; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
-; AVX-NEXT:    vandnps %ymm5, %ymm12, %ymm5
-; AVX-NEXT:    vandps %ymm12, %ymm10, %ymm10
-; AVX-NEXT:    vorps %ymm5, %ymm10, %ymm5
+; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm9
+; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm14
+; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm14, %ymm9
+; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
+; AVX-NEXT:    vandnps %ymm5, %ymm14, %ymm5
+; AVX-NEXT:    vandps %ymm14, %ymm9, %ymm9
+; AVX-NEXT:    vorps %ymm5, %ymm9, %ymm5
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm9
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm1, %xmm8, %xmm13
-; AVX-NEXT:    vpor %xmm10, %xmm13, %xmm10
-; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
-; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
-; AVX-NEXT:    vpshufb %xmm11, %xmm13, %xmm13
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm13, %ymm10
-; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
-; AVX-NEXT:    vandnps %ymm10, %ymm13, %ymm10
-; AVX-NEXT:    vorps %ymm5, %ymm10, %ymm10
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
-; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm14
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm14
-; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm8
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm8, %ymm8
-; AVX-NEXT:    vandnps %ymm5, %ymm12, %ymm5
-; AVX-NEXT:    vandps %ymm12, %ymm8, %ymm8
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm8
+; AVX-NEXT:    vpor %xmm9, %xmm8, %xmm8
+; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm9
+; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3]
+; AVX-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX-NEXT:    vandps %ymm5, %ymm9, %ymm5
+; AVX-NEXT:    vandnps %ymm8, %ymm9, %ymm8
 ; AVX-NEXT:    vorps %ymm5, %ymm8, %ymm5
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm2, %xmm4, %xmm8
+; AVX-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm8
+; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm7
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX-NEXT:    vandnps %ymm4, %ymm14, %ymm4
+; AVX-NEXT:    vandps %ymm7, %ymm14, %ymm7
+; AVX-NEXT:    vorps %ymm4, %ymm7, %ymm4
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm12
-; AVX-NEXT:    vpor %xmm8, %xmm12, %xmm8
-; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm12
-; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3]
-; AVX-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm12, %ymm8
-; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
-; AVX-NEXT:    vandnps %ymm8, %ymm13, %ymm8
-; AVX-NEXT:    vorps %ymm5, %ymm8, %ymm5
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm8
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT:    vandnps %ymm8, %ymm0, %ymm3
-; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm2
-; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm7
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vpor %xmm7, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm7
+; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3]
+; AVX-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX-NEXT:    vandps %ymm4, %ymm9, %ymm4
+; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
+; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
+; AVX-NEXT:    vpshufb %xmm10, %xmm15, %xmm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vandps %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero
-; AVX-NEXT:    vpor %xmm3, %xmm6, %xmm3
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm1
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
-; AVX-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vandps %ymm7, %ymm2, %ymm2
-; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
-; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm4
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
+; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX-NEXT:    vandps %ymm1, %ymm12, %ymm1
+; AVX-NEXT:    vandnps %ymm3, %ymm12, %ymm3
+; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 128(%rax)
 ; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX-NEXT:    vmovaps %ymm5, (%rax)
-; AVX-NEXT:    vmovaps %ymm10, 224(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm1, 352(%rax)
+; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX-NEXT:    vmovaps %ymm0, (%rax)
+; AVX-NEXT:    vmovaps %ymm5, 224(%rax)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, 352(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vmovaps %ymm0, 320(%rax)
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8403,379 +8407,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm9
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-NEXT:    vmovdqa (%r8), %ymm15
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
-; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-NEXT:    vpshufb %ymm1, %ymm15, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm7, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm7, %ymm1
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
-; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm8, %ymm3
-; AVX512-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
-; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm13
-; AVX512-NEXT:    vpor %ymm1, %ymm13, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm13
-; AVX512-NEXT:    vpshufb %ymm14, %ymm13, %ymm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm10, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512-NEXT:    vpshufb %ymm15, %ymm1, %ymm15
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm0
-; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpor %ymm2, %ymm15, %ymm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm1, %ymm15
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm22
-; AVX512-NEXT:    vpor %ymm2, %ymm15, %ymm2
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm3
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm9
-; AVX512-NEXT:    vpor %ymm3, %ymm9, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm21
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
-; AVX512-NEXT:    vpor %ymm3, %ymm9, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm9
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm9, %ymm12
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm20
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3]
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
-; AVX512-NEXT:    vpor %ymm3, %ymm11, %ymm3
+; AVX512-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
+; AVX512-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm3
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm7, %ymm6
+; AVX512-NEXT:    vpor %ymm3, %ymm6, %ymm3
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm13
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31]
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
 ; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm20
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512-NEXT:    vpshufb %ymm11, %ymm1, %ymm4
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT:    vpshufb %ymm14, %ymm0, %ymm11
+; AVX512-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[9],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[9],zero,ymm1[7,u,u,u],zero,ymm1[10],zero,ymm1[8,u,u,u],zero,ymm1[11],zero,ymm1[25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero
+; AVX512-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm14[2,3,2,3],zmm11[2,3,2,3]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm11
+; AVX512-NEXT:    vpshufb %ymm12, %ymm3, %ymm12
+; AVX512-NEXT:    vpor %ymm11, %ymm12, %ymm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512-NEXT:    vpshufb %ymm2, %ymm1, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm13, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm16
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm31
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm30
 ; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm31
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm4, %ymm8, %ymm1
-; AVX512-NEXT:    vpshufb %ymm9, %ymm7, %ymm2
-; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm2
-; AVX512-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm28
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[4],zero,xmm2[u,u,u,7],zero,xmm2[5],zero,xmm2[u,u,u,8],zero,xmm2[6],zero
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm2
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm18
+; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
+; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23,u,u,u],zero,ymm2[26],zero,ymm2[24,u,u,u],zero,ymm2[27],zero
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm16
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT:    vpshufb %xmm11, %xmm13, %xmm2
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm10
-; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-NEXT:    vporq %xmm2, %xmm0, %xmm26
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm11
-; AVX512-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512-NEXT:    vpshufb %xmm14, %xmm8, %xmm2
-; AVX512-NEXT:    vporq %xmm0, %xmm2, %xmm24
+; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm0
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512-NEXT:    vpshufb %xmm14, %xmm11, %xmm2
+; AVX512-NEXT:    vporq %xmm0, %xmm2, %xmm25
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm0
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm2
+; AVX512-NEXT:    vporq %xmm0, %xmm2, %xmm23
 ; AVX512-NEXT:    vmovdqa 32(%rax), %xmm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm25
-; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm19
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
 ; AVX512-NEXT:    vmovdqa 32(%r8), %xmm5
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero
-; AVX512-NEXT:    vporq %xmm0, %xmm3, %xmm22
+; AVX512-NEXT:    vporq %xmm0, %xmm3, %xmm20
 ; AVX512-NEXT:    vmovdqa (%rax), %xmm2
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm4, %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm4, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
+; AVX512-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm0[2,3,2,3],zmm8[0,1,0,1]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm17
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm9
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm9
-; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm12
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6))
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm6 = mem[2,3,2,3]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm0, %ymm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm4, %zmm4
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm1
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm27 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm15
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm15, %xmm14
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm15 & (zmm8 ^ zmm7))
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm7 = mem[2,3,2,3]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm31, %zmm7
 ; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm31 = mem[2,3,2,3]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm31
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6))
-; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm14
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
-; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm14
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm7 ^ (zmm15 & (zmm31 ^ zmm7))
+; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm7 = zmm14[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm15
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm22 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm22 = zmm15[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm15
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm1, %zmm1
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm1, %zmm28
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
-; AVX512-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-NEXT:    vpshufb %xmm5, %xmm12, %xmm11
-; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm25[0,1,0,1],zmm11[0,1,0,1]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm14, %xmm12
+; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,0,1],zmm2[0,1,0,1]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm13 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm14 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm13 ^ (mem & (zmm14 ^ zmm13))
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm6 ^ (zmm13 & (zmm14 ^ zmm6))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm15
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm17[2,3,2,3]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm17 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm13 & (zmm17 ^ zmm6))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm11 ^ (zmm13 & (zmm9 ^ zmm11))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem)
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4))
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem)
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28))
-; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20))
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm6 & mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[0,0,1,0]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm14))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm31))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm27))
+; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,1,0,4,4,5,4]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7))
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm6))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9))
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm29, 384(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rax)
@@ -8785,382 +8784,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1416, %rsp # imm = 0x588
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT:    subq $1480, %rsp # imm = 0x5C8
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero,zero,ymm9[18]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31]
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm15
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm7
-; AVX512-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm10
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm7
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm0
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm8
+; AVX512-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
 ; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[9],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero,ymm7[25]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm8[9],zero,ymm8[7,u,u,u],zero,ymm8[10],zero,ymm8[8,u,u,u],zero,ymm8[11],zero,ymm8[25],zero,ymm8[23,u,u,u],zero,ymm8[26],zero,ymm8[24,u,u,u],zero,ymm8[27],zero
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm14, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm14[2,3,2,3],zmm1[2,3,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm7
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23]
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[21],zero,ymm9[19],zero,zero,zero,zero,ymm9[22],zero,ymm9[20],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23]
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23,u,u,u],zero,ymm9[26],zero,ymm9[24,u,u,u],zero,ymm9[27],zero
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm5
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm5, %ymm30
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm2
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm0, %zmm29
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero,xmm4[u,u]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm21
-; AVX512-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm16
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm23
+; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm11
+; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm17
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm18
-; AVX512-FCP-NEXT:    vporq %xmm2, %xmm6, %xmm31
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm13[18,19,20,21],zero,ymm13[19],zero,ymm13[25,26,27,22],zero,ymm13[20],zero
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm7[0,1,14],zero,ymm7[12,13,0,1,14,15],zero,ymm7[3,12,13,2,3,16],zero,ymm7[30,31,28,29,16,17],zero,ymm7[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23]
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22],zero,ymm8[20],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm7[19],zero,ymm7[21,20,21,22],zero,ymm7[20],zero,ymm7[22,23]
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vporq %xmm1, %xmm0, %xmm20
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u]
-; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm12
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u]
+; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm20
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm0
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-FCP-NEXT:    vporq %xmm3, %xmm15, %xmm18
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm13
+; AVX512-FCP-NEXT:    vporq %xmm10, %xmm13, %xmm18
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm10, %ymm9
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm13
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23]
-; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm15
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm17, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm29[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm17, %ymm2
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm17, %ymm9
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm21[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm20[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm18[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm4))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24))
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30))
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm13, %zmm10
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm13 = mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm3, %zmm13
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7))
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9))
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23))
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm14[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm22[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm4 ^ (zmm9 & (zmm14 ^ zmm4))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm9 & (zmm10 ^ zmm23))
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm29 ^ (zmm4 & (zmm28 ^ zmm29))
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm7 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm9 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm15, %zmm9
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm4 & (zmm9 ^ zmm7))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm12, %zmm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm0 & mem)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7))
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm14))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm31 ^ (mem & (zmm3 ^ zmm31))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10))
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2))
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8))
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm16 ^ (mem & (zmm2 ^ zmm16))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
-; AVX512-FCP-NEXT:    addq $1416, %rsp # imm = 0x588
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rax)
+; AVX512-FCP-NEXT:    addq $1480, %rsp # imm = 0x5C8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -9170,379 +9161,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm9
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
-; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm15, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm7, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512DQ-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm7, %ymm1
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
-; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm8, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
-; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm5, %ymm13
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm13, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm13
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm13, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm10, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm1, %ymm15
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm15, %ymm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm1, %ymm15
-; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm22
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm15, %ymm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm15, %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm9
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm9, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm21
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm9, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm9
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm9, %ymm12
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm20
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm3
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm6, %ymm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm13
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31]
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm20
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[9],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[9],zero,ymm1[7,u,u,u],zero,ymm1[10],zero,ymm1[8,u,u,u],zero,ymm1[11],zero,ymm1[25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero
+; AVX512DQ-NEXT:    vpor %ymm4, %ymm11, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm14[2,3,2,3],zmm11[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm11
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm3, %ymm12
+; AVX512DQ-NEXT:    vpor %ymm11, %ymm12, %ymm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm13, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm16
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm27
+; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm31
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm30
 ; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm31
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm28
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[4],zero,xmm2[u,u,u,7],zero,xmm2[5],zero,xmm2[u,u,u,8],zero,xmm2[6],zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm18
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm9, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm1
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23,u,u,u],zero,ymm2[26],zero,ymm2[24,u,u,u],zero,ymm2[27],zero
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm16
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm13, %xmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm10
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-NEXT:    vporq %xmm2, %xmm0, %xmm26
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm2
-; AVX512DQ-NEXT:    vporq %xmm0, %xmm2, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm13, %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm11, %xmm2
+; AVX512DQ-NEXT:    vporq %xmm0, %xmm2, %xmm25
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm2
+; AVX512DQ-NEXT:    vporq %xmm0, %xmm2, %xmm23
 ; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm25
-; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm19
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm5
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero
-; AVX512DQ-NEXT:    vporq %xmm0, %xmm3, %xmm22
+; AVX512DQ-NEXT:    vporq %xmm0, %xmm3, %xmm20
 ; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm2
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
 ; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm4, %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm4, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm0[2,3,2,3],zmm8[0,1,0,1]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm17
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm12
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6))
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm6 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm27 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm15
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm15 & (zmm8 ^ zmm7))
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm7 = mem[2,3,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm31, %zmm7
 ; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm31 = mem[2,3,2,3]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm31
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6))
-; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm14
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
-; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm14
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm31 = zmm7 ^ (zmm15 & (zmm31 ^ zmm7))
+; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm7 = zmm14[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm15
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
+; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm22 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm22 = zmm15[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm15
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm1, %zmm28
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm12, %xmm11
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm25[0,1,0,1],zmm11[0,1,0,1]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm14, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,0,1],zmm2[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm13 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm14 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm13 ^ (mem & (zmm14 ^ zmm13))
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm6 ^ (zmm13 & (zmm14 ^ zmm6))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm15
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm17[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm17 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm13 & (zmm17 ^ zmm6))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm11 ^ (zmm13 & (zmm9 ^ zmm11))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem)
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem)
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28))
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20))
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm6 & mem)
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[0,0,1,0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm14))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & mem)
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm31))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm27))
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,1,0,4,4,5,4]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7))
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm6))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9))
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 128(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 384(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 192(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 64(%rax)
@@ -9552,1574 +9538,1506 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1416, %rsp # imm = 0x588
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT:    subq $1480, %rsp # imm = 0x5C8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero,zero,ymm9[18]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm7
-; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm10
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[9],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero,ymm7[25]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm8[9],zero,ymm8[7,u,u,u],zero,ymm8[10],zero,ymm8[8,u,u,u],zero,ymm8[11],zero,ymm8[25],zero,ymm8[23,u,u,u],zero,ymm8[26],zero,ymm8[24,u,u,u],zero,ymm8[27],zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm14, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm14[2,3,2,3],zmm1[2,3,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23]
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[21],zero,ymm9[19],zero,zero,zero,zero,ymm9[22],zero,ymm9[20],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23]
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23,u,u,u],zero,ymm9[26],zero,ymm9[24,u,u,u],zero,ymm9[27],zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm5
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero,xmm4[u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm21
-; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm23
+; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm11
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm18
-; AVX512DQ-FCP-NEXT:    vporq %xmm2, %xmm6, %xmm31
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm13[18,19,20,21],zero,ymm13[19],zero,ymm13[25,26,27,22],zero,ymm13[20],zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm7[0,1,14],zero,ymm7[12,13,0,1,14,15],zero,ymm7[3,12,13,2,3,16],zero,ymm7[30,31,28,29,16,17],zero,ymm7[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22],zero,ymm8[20],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm7[19],zero,ymm7[21,20,21,22],zero,ymm7[20],zero,ymm7[22,23]
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm0, %xmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u]
-; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm12
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u]
+; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-FCP-NEXT:    vporq %xmm3, %xmm15, %xmm18
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm13
+; AVX512DQ-FCP-NEXT:    vporq %xmm10, %xmm13, %xmm18
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm13
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm15
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm29[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm17, %ymm2
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm21[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm20[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm18[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30))
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm10 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm13, %zmm10
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm3, %zmm13
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7))
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9))
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23))
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm14[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm22[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm4 ^ (zmm9 & (zmm14 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm9 & (zmm10 ^ zmm23))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm28 = zmm29 ^ (zmm4 & (zmm28 ^ zmm29))
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm15, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm4 & (zmm9 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm12, %zmm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm0 & mem)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7))
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm14))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm31 ^ (mem & (zmm3 ^ zmm31))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10))
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2))
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8))
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm16 ^ (mem & (zmm2 ^ zmm16))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $1416, %rsp # imm = 0x588
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $1480, %rsp # imm = 0x5C8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i8_stride7_vf64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm0
-; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
-; AVX512BW-NEXT:    movl $338170920, %r10d # imm = 0x14281428
-; AVX512BW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm1 {%k2}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-NEXT:    vpshufb %ymm10, %ymm14, %ymm5
-; AVX512BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512BW-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512BW-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %ymm17
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm11, %ymm17, %ymm6
-; AVX512BW-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
-; AVX512BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm7
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512BW-NEXT:    vmovdqa 32(%rdx), %ymm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
+; AVX512BW-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-NEXT:    vpshufb %ymm11, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512BW-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,0,1,1,4,4,5,5]
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
 ; AVX512BW-NEXT:    movl $676341840, %r10d # imm = 0x28502850
-; AVX512BW-NEXT:    kmovd %r10d, %k3
-; AVX512BW-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k3}
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3]
-; AVX512BW-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512BW-NEXT:    kmovq %r10, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %ymm16
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm16, %ymm3
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %ymm18
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm18, %ymm12
-; AVX512BW-NEXT:    vpor %ymm3, %ymm12, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm18, %ymm19
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm16, %ymm20
-; AVX512BW-NEXT:    vporq %ymm19, %ymm20, %ymm19
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512BW-NEXT:    kmovq %r10, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512BW-NEXT:    vpermw %zmm3, %zmm19, %zmm19
-; AVX512BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
-; AVX512BW-NEXT:    kmovq %rax, %k4
-; AVX512BW-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k4}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-NEXT:    vpshufb %ymm19, %ymm15, %ymm21
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-NEXT:    vpshufb %ymm20, %ymm17, %ymm22
-; AVX512BW-NEXT:    vporq %ymm21, %ymm22, %ymm21
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512BW-NEXT:    vpshufb %ymm22, %ymm15, %ymm15
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512BW-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb %ymm25, %ymm17, %ymm17
-; AVX512BW-NEXT:    vporq %ymm15, %ymm17, %ymm15
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm15
-; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5]
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
-; AVX512BW-NEXT:    vpshufb %ymm21, %ymm2, %ymm17 {%k3}
+; AVX512BW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm6 {%k1}
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512BW-NEXT:    vpshufb %ymm13, %ymm5, %ymm7
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-NEXT:    vpshufb %ymm14, %ymm3, %ymm10
+; AVX512BW-NEXT:    vpor %ymm7, %ymm10, %ymm7
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm20
+; AVX512BW-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
+; AVX512BW-NEXT:    kmovq %r10, %k3
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm20 {%k3}
+; AVX512BW-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512BW-NEXT:    vpshufb %ymm15, %ymm2, %ymm6
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm12
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512BW-NEXT:    vpshufb %ymm16, %ymm12, %ymm10
+; AVX512BW-NEXT:    vpor %ymm6, %ymm10, %ymm6
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm17 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,ymm12[20,21,20,21],zero,ymm12[19],zero,ymm12[19,20,21,22],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm18 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
+; AVX512BW-NEXT:    vporq %ymm17, %ymm18, %ymm17
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-NEXT:    vpshufb %ymm23, %ymm14, %ymm14
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-NEXT:    vpshufb %ymm24, %ymm2, %ymm2
-; AVX512BW-NEXT:    vpor %ymm2, %ymm14, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm2, %zmm2
-; AVX512BW-NEXT:    movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830
-; AVX512BW-NEXT:    kmovq %rax, %k4
-; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm2 {%k4}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-NEXT:    vpshufb %ymm14, %ymm16, %ymm17
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-NEXT:    vpshufb %ymm15, %ymm18, %ymm26
-; AVX512BW-NEXT:    vporq %ymm17, %ymm26, %ymm17
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512BW-NEXT:    vpshufb %ymm26, %ymm18, %ymm18
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512BW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb %ymm28, %ymm16, %ymm16
-; AVX512BW-NEXT:    vporq %ymm18, %ymm16, %ymm16
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-NEXT:    vpermw %ymm3, %ymm29, %ymm17
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm17
+; AVX512BW-NEXT:    vmovdqa 32(%rax), %ymm8
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-NEXT:    vpermw %ymm8, %ymm6, %ymm18
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm19 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm19, %zmm18
 ; AVX512BW-NEXT:    movabsq $145249953336295682, %rax # imm = 0x204081020408102
-; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm17, %zmm16 {%k5}
+; AVX512BW-NEXT:    kmovq %rax, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm18, %zmm17 {%k2}
 ; AVX512BW-NEXT:    movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
+; AVX512BW-NEXT:    kmovq %rax, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm17, %zmm20 {%k2}
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm17 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[2,2,3,3,6,6,7,7]
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
+; AVX512BW-NEXT:    movl $338170920, %eax # imm = 0x14281428
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    vpshufb %ymm18, %ymm3, %ymm17 {%k4}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[9],zero,ymm3[7],zero,zero,zero,zero,ymm3[10],zero,ymm3[8],zero,zero,zero,zero,ymm3[11],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,ymm5[11],zero,ymm5[9],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25]
+; AVX512BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm3[2,3,2,3],zmm17[2,3,2,3]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[8,9],zero,ymm0[7],zero,ymm0[5,6,7,10],zero,ymm0[8],zero,ymm0[12,13,10,11,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm17 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero
+; AVX512BW-NEXT:    vporq %ymm3, %ymm17, %ymm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm26 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512BW-NEXT:    vpshufb %ymm26, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512BW-NEXT:    kmovq %rax, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512BW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm24, %ymm2, %ymm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512BW-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm25, %ymm12, %ymm1
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[8,9,8,9],zero,ymm12[7],zero,ymm12[7,8,9,10],zero,ymm12[8],zero,ymm12[14,15,24,25,24,25],zero,ymm12[23],zero,ymm12[23,24,25,26],zero,ymm12[24],zero,ymm12[30,31]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512BW-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512BW-NEXT:    kmovq %rax, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512BW-NEXT:    vpermw %zmm8, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm16, %zmm2 {%k5}
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm16
-; AVX512BW-NEXT:    vpshufb %ymm5, %ymm16, %ymm5
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512BW-NEXT:    vpshufb %ymm11, %ymm17, %ymm11
-; AVX512BW-NEXT:    vpor %ymm5, %ymm11, %ymm5
-; AVX512BW-NEXT:    vpshufb %ymm22, %ymm16, %ymm11
-; AVX512BW-NEXT:    vpshufb %ymm25, %ymm17, %ymm18
-; AVX512BW-NEXT:    vporq %ymm11, %ymm18, %ymm11
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3]
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm25
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm27
-; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5]
-; AVX512BW-NEXT:    vpshufb %ymm21, %ymm25, %ymm11 {%k3}
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm25, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm10, %ymm27, %ymm10
-; AVX512BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3]
-; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %ymm18
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm18, %ymm9
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %ymm21
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm21, %ymm10
-; AVX512BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm26, %ymm18, %ymm10
-; AVX512BW-NEXT:    vpshufb %ymm28, %ymm21, %ymm11
-; AVX512BW-NEXT:    vpor %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm29, %zmm10
-; AVX512BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k5}
-; AVX512BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm5 {%k5}
-; AVX512BW-NEXT:    vpshufb %ymm19, %ymm16, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm20, %ymm17, %ymm10
-; AVX512BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm19
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-NEXT:    vpshufb %xmm22, %xmm12, %xmm12
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm12, %zmm26
-; AVX512BW-NEXT:    vpshufb %ymm23, %ymm27, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm24, %ymm25, %ymm12
-; AVX512BW-NEXT:    vpor %ymm9, %ymm12, %ymm9
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm13
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-NEXT:    vpshufb %xmm23, %xmm24, %xmm24
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm24, %zmm9
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k5}
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm22
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm22, %ymm0
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %ymm23
+; AVX512BW-NEXT:    vpshufb %ymm11, %ymm23, %ymm1
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm12
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512BW-NEXT:    vpshufb %ymm13, %ymm28, %ymm1
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm31
+; AVX512BW-NEXT:    vpshufb %ymm14, %ymm31, %ymm2
+; AVX512BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm13
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm9
 ; AVX512BW-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
 ; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm26, %zmm9 {%k5}
-; AVX512BW-NEXT:    vpshufb %ymm14, %ymm21, %ymm14
-; AVX512BW-NEXT:    vpshufb %ymm15, %ymm18, %ymm15
-; AVX512BW-NEXT:    vporq %ymm14, %ymm15, %ymm24
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm14
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm28, %xmm28
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm28, %zmm24
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm28, %zmm28
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %ymm29
+; AVX512BW-NEXT:    vpshufb %ymm15, %ymm29, %ymm0
+; AVX512BW-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512BW-NEXT:    vpshufb %ymm16, %ymm6, %ymm1
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm15
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %xmm16
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512BW-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
 ; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm28, %zmm24 {%k5}
-; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm28
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k5}
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm27
 ; AVX512BW-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
 ; AVX512BW-NEXT:    kmovq %rax, %k5
-; AVX512BW-NEXT:    vmovdqu8 %zmm24, %zmm9 {%k5}
-; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm25, %ymm27 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512BW-NEXT:    vpshufb %xmm24, %xmm4, %xmm4
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512BW-NEXT:    vpshufb %ymm18, %ymm31, %ymm0 {%k4}
+; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm30
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm27[0],xmm30[0],xmm27[1],xmm30[1],xmm27[2],xmm30[2],xmm27[3],xmm30[3],xmm27[4],xmm30[4],xmm27[5],xmm30[5],xmm27[6],xmm30[6],xmm27[7],xmm30[7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3],zmm1[0,1,0,1]
+; AVX512BW-NEXT:    vpshufb %ymm26, %ymm23, %ymm18
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm22[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512BW-NEXT:    vpshufb %xmm26, %xmm0, %xmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm18 {%k3}
+; AVX512BW-NEXT:    vpshufb %ymm24, %ymm29, %ymm0
+; AVX512BW-NEXT:    vpshufb %ymm25, %ymm6, %ymm3
+; AVX512BW-NEXT:    vpor %ymm0, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm25
-; AVX512BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm4
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7]
-; AVX512BW-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k3}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512BW-NEXT:    vpshufb %xmm16, %xmm6, %xmm6
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm17
-; AVX512BW-NEXT:    vmovdqu8 %zmm27, %zmm4 {%k4}
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm21, %ymm6
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm18, %ymm7
-; AVX512BW-NEXT:    vpor %ymm6, %ymm7, %ymm7
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm8, %zmm8
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm25[0],xmm3[1],xmm25[1],xmm3[2],xmm25[2],xmm3[3],xmm25[3],xmm3[4],xmm25[4],xmm3[5],xmm25[5],xmm3[6],xmm25[6],xmm3[7],xmm25[7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm24, %xmm0, %xmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512BW-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512BW-NEXT:    kmovq %rax, %k2
-; AVX512BW-NEXT:    vmovdqu8 %zmm8, %zmm7 {%k2}
+; AVX512BW-NEXT:    kmovq %rax, %k3
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
 ; AVX512BW-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512BW-NEXT:    kmovq %rax, %k2
-; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm4 {%k2}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm20, %xmm8
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm18, %xmm19, %xmm21
-; AVX512BW-NEXT:    vporq %xmm8, %xmm21, %xmm8
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512BW-NEXT:    vpshufb %xmm22, %xmm19, %xmm19
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm29, %xmm20
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-NEXT:    vpshufb %xmm21, %xmm28, %xmm22
-; AVX512BW-NEXT:    vporq %xmm20, %xmm22, %xmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512BW-NEXT:    vpshufb %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu8 %zmm8, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm25, %xmm22
+; AVX512BW-NEXT:    kmovq %rax, %k3
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k3}
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512BW-NEXT:    vpshufb %ymm4, %ymm31, %ymm0 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm31[23],zero,zero,zero,zero,ymm31[26],zero,ymm31[24],zero,zero,zero,zero,ymm31[27],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512BW-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[18],zero,zero,zero,zero,ymm23[21],zero,ymm23[19],zero,zero,zero,zero,ymm23[22],zero,ymm23[20]
+; AVX512BW-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm22[23],zero,ymm22[21,22,23,26],zero,ymm22[24],zero,ymm22[28,29,26,27]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm22 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[25],zero,ymm23[23],zero,zero,zero,zero,ymm23[26],zero,ymm23[24],zero,zero,zero,zero
+; AVX512BW-NEXT:    vporq %ymm4, %ymm22, %ymm4
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k2}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22]
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero
+; AVX512BW-NEXT:    vpor %ymm1, %ymm6, %ymm1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
+; AVX512BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm22
+; AVX512BW-NEXT:    vporq %xmm1, %xmm22, %xmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm2[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm30, %xmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512BW-NEXT:    vpshufb %xmm22, %xmm27, %xmm23
+; AVX512BW-NEXT:    vporq %xmm5, %xmm23, %xmm5
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm30[8],xmm27[8],xmm30[9],xmm27[9],xmm30[10],xmm27[10],xmm30[11],xmm27[11],xmm30[12],xmm27[12],xmm30[13],xmm27[13],xmm30[14],xmm27[14],xmm30[15],xmm27[15]
+; AVX512BW-NEXT:    vpshufb %xmm19, %xmm23, %xmm19
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm19[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm5 {%k2}
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm25, %xmm19
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-NEXT:    vpshufb %xmm23, %xmm17, %xmm27
-; AVX512BW-NEXT:    vporq %xmm22, %xmm27, %xmm22
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15]
-; AVX512BW-NEXT:    vpshufb %xmm26, %xmm17, %xmm17
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-NEXT:    vpermw %zmm3, %zmm22, %zmm3
+; AVX512BW-NEXT:    vpshufb %xmm23, %xmm3, %xmm27
+; AVX512BW-NEXT:    vporq %xmm19, %xmm27, %xmm19
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm25[8],xmm3[9],xmm25[9],xmm3[10],xmm25[10],xmm3[11],xmm25[11],xmm3[12],xmm25[12],xmm3[13],xmm25[13],xmm3[14],xmm25[14],xmm3[15],xmm25[15]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm19[0,1,0,1],zmm3[0,1,0,1]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-NEXT:    vpermw %zmm8, %zmm10, %zmm8
 ; AVX512BW-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %zmm8, %zmm3 {%k1}
 ; AVX512BW-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm17, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm18, %xmm10, %xmm7
-; AVX512BW-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512BW-NEXT:    vpshufb %xmm16, %xmm7, %xmm7
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %xmm19, %xmm13, %xmm7
-; AVX512BW-NEXT:    vpshufb %xmm21, %xmm12, %xmm10
-; AVX512BW-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512BW-NEXT:    vpshufb %xmm24, %xmm10, %xmm10
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm11, %xmm3
+; AVX512BW-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX512BW-NEXT:    vpshufb %xmm26, %xmm3, %xmm3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm22, %xmm13, %xmm3
+; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; AVX512BW-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
 ; AVX512BW-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm7 {%k1}
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm14, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm23, %xmm15, %xmm8
-; AVX512BW-NEXT:    vpor %xmm3, %xmm8, %xmm3
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm23, %xmm16, %xmm1
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-NEXT:    vpshufb %xmm24, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512BW-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 192(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
-; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512BW-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm5
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm5, %ymm2
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm17
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512BW-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm17, %ymm5
-; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm17, %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm16, %ymm6
-; AVX512BW-FCP-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-FCP-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %ymm18
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm18, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm22
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm22, %ymm12
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm22, %ymm14
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3]
-; AVX512BW-FCP-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT:    movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm1, %ymm23
-; AVX512BW-FCP-NEXT:    vporq %ymm15, %ymm23, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm11, %ymm11
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512BW-FCP-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm11, %ymm1
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm16, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm17, %ymm26
-; AVX512BW-FCP-NEXT:    vporq %ymm1, %ymm26, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512BW-FCP-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT:    vporq %ymm16, %ymm17, %ymm16
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm18
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm18, %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm26
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm26, %ymm1
+; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm18[18,19,20,21],zero,ymm18[19],zero,ymm18[25,26,27,22],zero,ymm18[20],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm26[18],zero,zero,zero,zero,ymm26[21],zero,ymm26[19],zero,zero,zero,zero,ymm26[22],zero,ymm26[20]
+; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm28
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm28, %ymm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm29
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm29, %ymm4
+; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm28[19],zero,ymm28[21,20,21,22],zero,ymm28[20],zero,ymm28[22,23]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm25
 ; AVX512BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm16, %ymm18, %ymm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm17, %ymm22, %ymm28
-; AVX512BW-FCP-NEXT:    vporq %ymm23, %ymm28, %ymm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm22, %ymm22
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512BW-FCP-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT:    vporq %ymm22, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm23, %zmm18
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm30, %ymm22
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm23, %zmm22
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %ymm30
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm30, %ymm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm31
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm31, %ymm4
+; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[20],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22]
+; AVX512BW-FCP-NEXT:    vpor %ymm9, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rax), %ymm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-FCP-NEXT:    vpermw %ymm16, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm17, %zmm13
 ; AVX512BW-FCP-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm18 {%k3}
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm9 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
-; AVX512BW-FCP-NEXT:    kmovq %r10, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm22
-; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm22, %ymm20
-; AVX512BW-FCP-NEXT:    vporq %ymm19, %ymm20, %ymm19
-; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm22, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb %ymm27, %ymm18, %ymm23
-; AVX512BW-FCP-NEXT:    vporq %ymm20, %ymm23, %ymm20
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm19, %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm23
-; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm10
-; AVX512BW-FCP-NEXT:    vpor %ymm8, %ymm10, %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm19, %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm23, %ymm25
-; AVX512BW-FCP-NEXT:    vporq %ymm24, %ymm25, %ymm24
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm8 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm20, %ymm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm24, %ymm13
-; AVX512BW-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb %ymm28, %ymm20, %ymm13
-; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm24, %ymm25
-; AVX512BW-FCP-NEXT:    vporq %ymm13, %ymm25, %ymm13
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vpermw %zmm10, %zmm30, %zmm13
-; AVX512BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm12 {%k3}
-; AVX512BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm19, %ymm12
-; AVX512BW-FCP-NEXT:    vpshufb %ymm21, %ymm23, %ymm13
-; AVX512BW-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm14
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm25, %xmm25
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm25, %zmm26
-; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm22, %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm18, %ymm14
-; AVX512BW-FCP-NEXT:    vpor %ymm11, %ymm14, %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm14
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm27, %xmm27
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512BW-FCP-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm26, %zmm11 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm16, %ymm24, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %ymm17, %ymm20, %ymm17
-; AVX512BW-FCP-NEXT:    vporq %ymm16, %ymm17, %ymm27
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm16
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %xmm17
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm28, %xmm28
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm28, %zmm27
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm10, %zmm28, %zmm28
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm25 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm13
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm11
+; AVX512BW-FCP-NEXT:    vpor %ymm13, %ymm11, %ymm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm19
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm12
+; AVX512BW-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm21
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm12, %zmm8
+; AVX512BW-FCP-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm12
+; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm22
+; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm15
+; AVX512BW-FCP-NEXT:    vporq %ymm22, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %xmm23
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm15
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm28, %zmm27 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm28
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
+; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm26, %ymm2
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm18[8,9],zero,ymm18[7],zero,ymm18[5,6,7,10],zero,ymm18[8],zero,ymm18[12,13,10,11,24,25],zero,ymm18[23],zero,ymm18[21,22,23,26],zero,ymm18[24],zero,ymm18[28,29,26,27]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = zero,zero,ymm26[9],zero,ymm26[7],zero,zero,zero,zero,ymm26[10],zero,ymm26[8],zero,zero,zero,zero,zero,zero,ymm26[25],zero,ymm26[23],zero,zero,zero,zero,ymm26[26],zero,ymm26[24],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vporq %ymm3, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[2,3,2,3],zmm2[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm28, %ymm18
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm26 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512BW-FCP-NEXT:    # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm29, %ymm4
+; AVX512BW-FCP-NEXT:    vporq %ymm18, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm29[9],zero,ymm29[7],zero,zero,zero,zero,ymm29[10],zero,ymm29[8],zero,zero,zero,zero,ymm29[11],zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero,ymm29[27],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = zero,ymm28[7],zero,zero,zero,zero,ymm28[10],zero,ymm28[8],zero,zero,zero,zero,ymm28[11],zero,ymm28[9],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512BW-FCP-NEXT:    vporq %ymm18, %ymm28, %ymm18
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-FCP-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm30, %ymm4
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512BW-FCP-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm31, %ymm28
+; AVX512BW-FCP-NEXT:    vporq %ymm4, %ymm28, %ymm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = ymm31[8,9,8,9],zero,ymm31[7],zero,ymm31[7,8,9,10],zero,ymm31[8],zero,ymm31[14,15,24,25,24,25],zero,ymm31[23],zero,ymm31[23,24,25,26],zero,ymm31[24],zero,ymm31[30,31]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,ymm30[9],zero,ymm30[7],zero,zero,zero,zero,ymm30[10],zero,ymm30[8],zero,zero,zero,zero,zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %ymm28, %ymm30, %ymm28
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm28[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-FCP-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm11 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm27
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm22
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm18, %ymm9
-; AVX512BW-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7]
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm29
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm23, %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm23
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm19, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm18 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb %ymm26, %ymm13, %ymm4
 ; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm19
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm24, %ymm4
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm20, %ymm5
-; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7]
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512BW-FCP-NEXT:    vpermw %zmm10, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm31
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm4[0],xmm31[0],xmm4[1],xmm31[1],xmm4[2],xmm31[2],xmm4[3],xmm31[3],xmm4[4],xmm31[4],xmm4[5],xmm31[5],xmm4[6],xmm31[6],xmm4[7],xmm31[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm28 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm26, %xmm26
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm26[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm26[0],xmm1[1],xmm26[1],xmm1[2],xmm26[2],xmm1[3],xmm26[3],xmm1[4],xmm26[4],xmm1[5],xmm26[5],xmm1[6],xmm26[6],xmm1[7],xmm26[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm30 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm30, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm29, %ymm14, %ymm3
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm29, %xmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm28, %xmm20
-; AVX512BW-FCP-NEXT:    vporq %xmm5, %xmm20, %xmm5
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm20, %xmm20
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm22, %xmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm27, %xmm28
-; AVX512BW-FCP-NEXT:    vporq %xmm5, %xmm28, %xmm5
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm22, %xmm22
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm23, %xmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm19, %xmm27
-; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm27, %xmm22
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm19, %xmm19
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm22, %zmm2
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm26, %xmm2
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm24
+; AVX512BW-FCP-NEXT:    vporq %xmm2, %xmm24, %xmm2
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm1[8],xmm26[9],xmm1[9],xmm26[10],xmm1[10],xmm26[11],xmm1[11],xmm26[12],xmm1[12],xmm26[13],xmm1[13],xmm26[14],xmm1[14],xmm26[15],xmm1[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm31, %xmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm4, %xmm27
+; AVX512BW-FCP-NEXT:    vporq %xmm24, %xmm27, %xmm24
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm4[8],xmm31[9],xmm4[9],xmm31[10],xmm4[10],xmm31[11],xmm4[11],xmm31[12],xmm4[12],xmm31[13],xmm4[13],xmm31[14],xmm4[14],xmm31[15],xmm4[15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm4[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm24 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm5, %xmm31
+; AVX512BW-FCP-NEXT:    vporq %xmm4, %xmm31, %xmm4
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,0,1],zmm3[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
 ; AVX512BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm12, %xmm6
-; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm6, %xmm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm15, %xmm6
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm14, %xmm9
-; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm24 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm19, %xmm3
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm17, %xmm0
+; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm30, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm20, %xmm3
+; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm6 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm16, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm17, %xmm7
-; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm7, %xmm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-FCP-NEXT:    vpermw %zmm10, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb %xmm1, %xmm22, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm23, %xmm1
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25]
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm9[18,19,20,21],zero,ymm9[19],zero,ymm9[25,26,27,22],zero,ymm9[20],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20]
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm9[23],zero,ymm9[21,22,23,26],zero,ymm9[24],zero,ymm9[28,29,26,27]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
-; AVX512DQ-BW-NEXT:    movl $338170920, %r10d # imm = 0x14281428
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm1 {%k2}
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512DQ-BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm10, %ymm14, %ymm5
-; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm5, %ymm15, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %ymm17
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm17, %ymm6
-; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm7
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm1, %ymm3
+; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,0,1,1,4,4,5,5]
+; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
 ; AVX512DQ-BW-NEXT:    movl $676341840, %r10d # imm = 0x28502850
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k3}
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQ-BW-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512DQ-BW-NEXT:    kmovq %r10, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %ymm16
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512DQ-BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm16, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %ymm18
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm18, %ymm12
-; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm12, %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm18, %ymm19
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm16, %ymm20
-; AVX512DQ-BW-NEXT:    vporq %ymm19, %ymm20, %ymm19
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQ-BW-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512DQ-BW-NEXT:    kmovq %r10, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm3, %zmm19, %zmm19
-; AVX512DQ-BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k4
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k4}
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm15, %ymm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm17, %ymm22
-; AVX512DQ-BW-NEXT:    vporq %ymm21, %ymm22, %ymm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm22, %ymm15, %ymm15
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-BW-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm17, %ymm17
-; AVX512DQ-BW-NEXT:    vporq %ymm15, %ymm17, %ymm15
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm15
-; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm21, %ymm2, %ymm17 {%k3}
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm6 {%k1}
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm5, %ymm7
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm3, %ymm10
+; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm10, %ymm7
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm20
+; AVX512DQ-BW-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
+; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm20 {%k3}
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm2, %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm16, %ymm12, %ymm10
+; AVX512DQ-BW-NEXT:    vpor %ymm6, %ymm10, %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm17 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,ymm12[20,21,20,21],zero,ymm12[19],zero,ymm12[19,20,21,22],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm18 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
+; AVX512DQ-BW-NEXT:    vporq %ymm17, %ymm18, %ymm17
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm23, %ymm14, %ymm14
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm14, %ymm2
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k4
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm2 {%k4}
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm16, %ymm17
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm18, %ymm26
-; AVX512DQ-BW-NEXT:    vporq %ymm17, %ymm26, %ymm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm18, %ymm18
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-BW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm28, %ymm16, %ymm16
-; AVX512DQ-BW-NEXT:    vporq %ymm18, %ymm16, %ymm16
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-NEXT:    vpermw %ymm3, %ymm29, %ymm17
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rax), %ymm8
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-NEXT:    vpermw %ymm8, %ymm6, %ymm18
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm19 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm19, %zmm18
 ; AVX512DQ-BW-NEXT:    movabsq $145249953336295682, %rax # imm = 0x204081020408102
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm17, %zmm16 {%k5}
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm18, %zmm17 {%k2}
 ; AVX512DQ-BW-NEXT:    movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm17, %zmm20 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm17 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
+; AVX512DQ-BW-NEXT:    movl $338170920, %eax # imm = 0x14281428
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
+; AVX512DQ-BW-NEXT:    vpshufb %ymm18, %ymm3, %ymm17 {%k4}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[9],zero,ymm3[7],zero,zero,zero,zero,ymm3[10],zero,ymm3[8],zero,zero,zero,zero,ymm3[11],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,ymm5[11],zero,ymm5[9],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25]
+; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm3[2,3,2,3],zmm17[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[8,9],zero,ymm0[7],zero,ymm0[5,6,7,10],zero,ymm0[8],zero,ymm0[12,13,10,11,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm17 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vporq %ymm3, %ymm17, %ymm3
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm26 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512DQ-BW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-BW-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm12, %ymm1
+; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[8,9,8,9],zero,ymm12[7],zero,ymm12[7,8,9,10],zero,ymm12[8],zero,ymm12[14,15,24,25,24,25],zero,ymm12[23],zero,ymm12[23,24,25,26],zero,ymm12[24],zero,ymm12[30,31]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512DQ-BW-NEXT:    vpermw %zmm8, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm16, %zmm2 {%k5}
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %ymm16
-; AVX512DQ-BW-NEXT:    vpshufb %ymm5, %ymm16, %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm17, %ymm11
-; AVX512DQ-BW-NEXT:    vpor %ymm5, %ymm11, %ymm5
-; AVX512DQ-BW-NEXT:    vpshufb %ymm22, %ymm16, %ymm11
-; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm17, %ymm18
-; AVX512DQ-BW-NEXT:    vporq %ymm11, %ymm18, %ymm11
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %ymm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm27
-; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm21, %ymm25, %ymm11 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm25, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm10, %ymm27, %ymm10
-; AVX512DQ-BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %ymm18
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm18, %ymm9
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %ymm21
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm21, %ymm10
-; AVX512DQ-BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm18, %ymm10
-; AVX512DQ-BW-NEXT:    vpshufb %ymm28, %ymm21, %ymm11
-; AVX512DQ-BW-NEXT:    vpor %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm29, %zmm10
-; AVX512DQ-BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm5 {%k5}
-; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm16, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm17, %ymm10
-; AVX512DQ-BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm12, %zmm26
-; AVX512DQ-BW-NEXT:    vpshufb %ymm23, %ymm27, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm25, %ymm12
-; AVX512DQ-BW-NEXT:    vpor %ymm9, %ymm12, %ymm9
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm13
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm24, %xmm24
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm24, %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm7 {%k5}
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %ymm22
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm22, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %ymm23
+; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm23, %ymm1
+; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm12
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm28, %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %ymm31
+; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm31, %ymm2
+; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm9
 ; AVX512DQ-BW-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm26, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm21, %ymm14
-; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm18, %ymm15
-; AVX512DQ-BW-NEXT:    vporq %ymm14, %ymm15, %ymm24
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm14
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm28, %xmm28
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm28, %zmm24
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm28, %zmm28
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %ymm29
+; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm29, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb %ymm16, %ymm6, %ymm1
+; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %xmm16
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm28, %zmm24 {%k5}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k5}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm27
 ; AVX512DQ-BW-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm24, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm25, %ymm27 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm24, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm18, %ymm31, %ymm0 {%k4}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm30
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm27[0],xmm30[0],xmm27[1],xmm30[1],xmm27[2],xmm30[2],xmm27[3],xmm30[3],xmm27[4],xmm30[4],xmm27[5],xmm30[5],xmm27[6],xmm30[6],xmm27[7],xmm30[7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3],zmm1[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm23, %ymm18
+; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm22[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm0, %ymm18 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm18 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm29, %ymm0
+; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm6, %ymm3
+; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm3, %ymm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm25
-; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm4
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm17
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm27, %zmm4 {%k4}
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm21, %ymm6
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm18, %ymm7
-; AVX512DQ-BW-NEXT:    vpor %ymm6, %ymm7, %ymm7
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm8, %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm25[0],xmm3[1],xmm25[1],xmm3[2],xmm25[2],xmm3[3],xmm25[3],xmm3[4],xmm25[4],xmm3[5],xmm25[5],xmm3[6],xmm25[6],xmm3[7],xmm25[7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm24, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm8, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
 ; AVX512DQ-BW-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm7, %zmm4 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm20, %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm19, %xmm21
-; AVX512DQ-BW-NEXT:    vporq %xmm8, %xmm21, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm19, %xmm19
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm29, %xmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm28, %xmm22
-; AVX512DQ-BW-NEXT:    vporq %xmm20, %xmm22, %xmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm8, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm25, %xmm22
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k3
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k3}
+; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm31, %ymm0 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm31[23],zero,zero,zero,zero,ymm31[26],zero,ymm31[24],zero,zero,zero,zero,ymm31[27],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[18],zero,zero,zero,zero,ymm23[21],zero,ymm23[19],zero,zero,zero,zero,ymm23[22],zero,ymm23[20]
+; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm22[23],zero,ymm22[21,22,23,26],zero,ymm22[24],zero,ymm22[28,29,26,27]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm22 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[25],zero,ymm23[23],zero,zero,zero,zero,ymm23[26],zero,ymm23[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vporq %ymm4, %ymm22, %ymm4
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22]
+; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero
+; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm6, %ymm1
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm22
+; AVX512DQ-BW-NEXT:    vporq %xmm1, %xmm22, %xmm1
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm2[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm30, %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm27, %xmm23
+; AVX512DQ-BW-NEXT:    vporq %xmm5, %xmm23, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm30[8],xmm27[8],xmm30[9],xmm27[9],xmm30[10],xmm27[10],xmm30[11],xmm27[11],xmm30[12],xmm27[12],xmm30[13],xmm27[13],xmm30[14],xmm27[14],xmm30[15],xmm27[15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm23, %xmm19
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm19[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm5 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm25, %xmm19
 ; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm17, %xmm27
-; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm27, %xmm22
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm17, %xmm17
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-NEXT:    vpermw %zmm3, %zmm22, %zmm3
+; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm3, %xmm27
+; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm27, %xmm19
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm25[8],xmm3[9],xmm25[9],xmm3[10],xmm25[10],xmm3[11],xmm25[11],xmm3[12],xmm25[12],xmm3[13],xmm25[13],xmm3[14],xmm25[14],xmm3[15],xmm25[15]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm19[0,1,0,1],zmm3[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-NEXT:    vpermw %zmm8, %zmm10, %zmm8
 ; AVX512DQ-BW-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm8, %zmm3 {%k1}
 ; AVX512DQ-BW-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm17, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm10, %xmm7
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm7, %xmm7
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm13, %xmm7
-; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm12, %xmm10
-; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm24, %xmm10, %xmm10
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm5 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm11, %xmm3
+; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm26, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm13, %xmm3
+; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
 ; AVX512DQ-BW-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm7 {%k1}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm15, %xmm8
-; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm8, %xmm3
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm6, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
+; AVX512DQ-BW-NEXT:    vpshufb %xmm23, %xmm16, %xmm1
+; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm24, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-BW-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm7 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 192(%rax)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 384(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 384(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm5, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm17, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm17, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm16, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm18, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm22, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm12, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm22, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm18, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm14, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm1, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm15, %ymm23, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm11, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-BW-FCP-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm11, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm16, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm17, %ymm26
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm1, %ymm26, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512DQ-BW-FCP-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm16, %ymm17, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm18, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm26, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm18[18,19,20,21],zero,ymm18[19],zero,ymm18[25,26,27,22],zero,ymm18[20],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm26[18],zero,zero,zero,zero,ymm26[21],zero,ymm26[19],zero,zero,zero,zero,ymm26[22],zero,ymm26[20]
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm28, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm29, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm28[19],zero,ymm28[21,20,21,22],zero,ymm28[20],zero,ymm28[22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm16, %ymm18, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm17, %ymm22, %ymm28
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm23, %ymm28, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm22, %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-BW-FCP-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm22, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm23, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm30, %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm23, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %ymm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm30, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm31, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[20],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22]
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm9, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rax), %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm16, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm17, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm18 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm9 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
-; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm22, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm19, %ymm20, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm22, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm27, %ymm18, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm20, %ymm23, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm19, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm8, %ymm10, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm19, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm23, %ymm25
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm24, %ymm25, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm20, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm24, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm28, %ymm20, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm24, %ymm25
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm13, %ymm25, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm10, %zmm30, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm12 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm19, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm21, %ymm23, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm25, %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm25, %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm22, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm18, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm11, %ymm14, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm27, %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm26, %zmm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm16, %ymm24, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm17, %ymm20, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm16, %ymm17, %ymm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm28, %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm27, %zmm28, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm10, %zmm28, %zmm28
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm25 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm13, %ymm11, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm12, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm22, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm28, %zmm27 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm28
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
+; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm26, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm18[8,9],zero,ymm18[7],zero,ymm18[5,6,7,10],zero,ymm18[8],zero,ymm18[12,13,10,11,24,25],zero,ymm18[23],zero,ymm18[21,22,23,26],zero,ymm18[24],zero,ymm18[28,29,26,27]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = zero,zero,ymm26[9],zero,ymm26[7],zero,zero,zero,zero,ymm26[10],zero,ymm26[8],zero,zero,zero,zero,zero,zero,ymm26[25],zero,ymm26[23],zero,zero,zero,zero,ymm26[26],zero,ymm26[24],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm3, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm28, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm26 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-BW-FCP-NEXT:    # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm29, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm18, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm29[9],zero,ymm29[7],zero,zero,zero,zero,ymm29[10],zero,ymm29[8],zero,zero,zero,zero,ymm29[11],zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero,ymm29[27],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = zero,ymm28[7],zero,zero,zero,zero,ymm28[10],zero,ymm28[8],zero,zero,zero,zero,ymm28[11],zero,ymm28[9],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm18, %ymm28, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512DQ-BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm30, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-BW-FCP-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm31, %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm4, %ymm28, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = ymm31[8,9,8,9],zero,ymm31[7],zero,ymm31[7,8,9,10],zero,ymm31[8],zero,ymm31[14,15,24,25,24,25],zero,ymm31[23],zero,ymm31[23,24,25,26],zero,ymm31[24],zero,ymm31[30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,ymm30[9],zero,ymm30[7],zero,zero,zero,zero,ymm30[10],zero,ymm30[8],zero,zero,zero,zero,zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm28, %ymm30, %ymm28
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm28[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm18, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm29
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm23, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm19, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm18 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm26, %ymm13, %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm24, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm20, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm10, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm31
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm4[0],xmm31[0],xmm4[1],xmm31[1],xmm4[2],xmm31[2],xmm4[3],xmm31[3],xmm4[4],xmm31[4],xmm4[5],xmm31[5],xmm4[6],xmm31[6],xmm4[7],xmm31[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm28 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm26, %xmm26
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm26[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm26[0],xmm1[1],xmm26[1],xmm1[2],xmm26[2],xmm1[3],xmm26[3],xmm1[4],xmm26[4],xmm1[5],xmm26[5],xmm1[6],xmm26[6],xmm1[7],xmm26[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm30 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm30, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm29, %ymm14, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm29, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm28, %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm5, %xmm20, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm22, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm27, %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm5, %xmm28, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm22, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm23, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm19, %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm27, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm19, %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm22, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm26, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm2, %xmm24, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm1[8],xmm26[9],xmm1[9],xmm26[10],xmm1[10],xmm26[11],xmm1[11],xmm26[12],xmm1[12],xmm26[13],xmm1[13],xmm26[14],xmm1[14],xmm26[15],xmm1[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm31, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm4, %xmm27
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm24, %xmm27, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm4[8],xmm31[9],xmm4[9],xmm31[10],xmm4[10],xmm31[11],xmm4[11],xmm31[12],xmm4[12],xmm31[13],xmm4[13],xmm31[14],xmm4[14],xmm31[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm4[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm24 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm5, %xmm31
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm4, %xmm31, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,0,1],zmm3[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm12, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm6, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm15, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm14, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm24 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm19, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm17, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm30, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm20, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm16, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm17, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm7, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm10, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm1, %xmm22, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm23, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25]
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm9[18,19,20,21],zero,ymm9[19],zero,ymm9[25,26,27,22],zero,ymm9[20],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20]
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm9[23],zero,ymm9[21,22,23,26],zero,ymm9[24],zero,ymm9[28,29,26,27]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index a03b03e120e88..7e3c534849b5b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2946,182 +2946,189 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX2-LABEL: store_i8_stride8_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $40, %rsp
+; AVX2-NEXT:    subq $72, %rsp
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT:    vmovdqa (%r10), %xmm3
 ; AVX2-NEXT:    vmovdqa (%rax), %xmm4
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa (%r9), %xmm6
-; AVX2-NEXT:    vmovdqa (%r8), %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa (%r8), %xmm8
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
-; AVX2-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15]
-; AVX2-NEXT:    vmovaps 16(%r10), %xmm7
-; AVX2-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4],ymm12[5],ymm15[6],ymm12[7]
-; AVX2-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX2-NEXT:    vmovdqa 16(%rax), %xmm12
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5],ymm11[6],ymm2[7],ymm11[8,9,10,11,12],ymm2[13],ymm11[14],ymm2[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm10
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7,8],ymm13[9],ymm9[10,11,12],ymm13[13],ymm9[14,15]
+; AVX2-NEXT:    vmovdqa 16(%r10), %xmm15
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4],ymm1[5],ymm13[6],ymm1[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm13, %ymm0
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm13, %ymm2
+; AVX2-NEXT:    vmovdqa 16(%rax), %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm11, %ymm11
-; AVX2-NEXT:    vmovdqa 16(%r9), %xmm14
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15]
-; AVX2-NEXT:    vmovdqa 16(%r8), %xmm15
+; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm0
+; AVX2-NEXT:    vmovaps 16(%r9), %xmm9
+; AVX2-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
+; AVX2-NEXT:    vmovdqa 16(%r8), %xmm14
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm4, %ymm11
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6],ymm11[7],ymm0[8,9,10,11,12],ymm11[13],ymm0[14],ymm11[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm8, %ymm8
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm4, %ymm12
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm6
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6],ymm12[7],ymm6[8,9,10,11,12],ymm12[13],ymm6[14],ymm12[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX2-NEXT:    vmovdqa %xmm1, %xmm9
+; AVX2-NEXT:    vmovdqa %xmm15, %xmm7
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm8, %ymm8
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm8, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15]
-; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm7
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm12, %ymm12
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6],ymm8[7],ymm12[8,9,10,11,12],ymm8[13],ymm12[14],ymm8[15]
+; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm6
 ; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm4
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm15, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm14, %ymm14
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6],ymm6[7],ymm14[8,9,10,11,12],ymm6[13],ymm14[14],ymm6[15]
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm15[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm5, %ymm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm14, %ymm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6],ymm5[7],ymm7[8,9,10,11,12],ymm5[13],ymm7[14],ymm5[15]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm4, %ymm4
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm4, %ymm4
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpshufb %xmm5, %xmm13, %xmm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm13[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm2, 64(%rax)
-; AVX2-NEXT:    vmovdqa %ymm1, 128(%rax)
-; AVX2-NEXT:    vmovdqa %ymm12, 192(%rax)
+; AVX2-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX2-NEXT:    vmovdqa %ymm2, 128(%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, 192(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm0, 224(%rax)
-; AVX2-NEXT:    vmovdqa %ymm8, 160(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-NEXT:    vmovaps %ymm0, 160(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-NEXT:    addq $40, %rsp
+; AVX2-NEXT:    addq $72, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -3434,7 +3441,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa (%rax), %xmm5
 ; AVX512-NEXT:    vmovdqa 16(%rax), %xmm12
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -3462,8 +3469,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm23
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm9
@@ -3506,18 +3513,20 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vinserti32x4 $1, %xmm3, %ymm1, %ymm19
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm0[0],xmm3[0]
 ; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm13
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm20 = xmm1[2,3,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm19[2,1,3,3,6,5,7,7]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm21[0],zero,xmm21[1],zero,xmm21[2],zero,xmm21[3],zero
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm19
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
@@ -3543,17 +3552,19 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
@@ -3579,8 +3590,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm4
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm5
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -3778,8 +3789,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa 16(%rax), %xmm10
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
@@ -3811,8 +3822,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm14, %ymm13, %ymm13
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm26
 ; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
@@ -3842,21 +3853,23 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm5, %ymm0, %ymm20
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm12, %ymm0, %ymm21
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm5, %ymm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm0[0],xmm5[0]
 ; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm22 = xmm0[2,3,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero,xmm22[2],zero,zero,zero,xmm22[3],zero,zero,zero
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm12 = xmm21[0],zero,xmm21[1],zero,xmm21[2],zero,xmm21[3],zero
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7,8],ymm12[9],ymm0[10,11,12],ymm12[13],ymm0[14,15]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm12
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
@@ -3883,25 +3896,27 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm13, %ymm11, %ymm11
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm10[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm13[0]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
 ; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
@@ -3923,8 +3938,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm7
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
 ; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
@@ -5679,48 +5694,48 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX2-LABEL: store_i8_stride8_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $328, %rsp # imm = 0x148
+; AVX2-NEXT:    subq $296, %rsp # imm = 0x128
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%r10), %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa 48(%r10), %xmm11
-; AVX2-NEXT:    vmovdqa (%rax), %xmm1
+; AVX2-NEXT:    vmovdqa (%r10), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vmovdqa 48(%r10), %xmm0
+; AVX2-NEXT:    vmovdqa (%rax), %xmm2
+; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa (%r9), %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa (%r9), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovdqa (%r8), %xmm3
 ; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15]
-; AVX2-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7,8],ymm1[9],ymm7[10,11,12],ymm1[13],ymm7[14,15]
 ; AVX2-NEXT:    vmovdqa 48(%rax), %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
@@ -5730,35 +5745,37 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa 48(%r9), %xmm3
-; AVX2-NEXT:    vmovdqa 48(%r8), %xmm5
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-NEXT:    vmovdqa 48(%r9), %xmm2
+; AVX2-NEXT:    vmovdqa 48(%r8), %xmm3
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15]
-; AVX2-NEXT:    vmovdqa 48(%rcx), %xmm2
-; AVX2-NEXT:    vmovdqa 48(%rdx), %xmm4
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm12, %xmm6
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm13 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vmovdqa 48(%rcx), %xmm4
+; AVX2-NEXT:    vmovdqa 48(%rdx), %xmm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
 ; AVX2-NEXT:    vmovdqa 48(%rsi), %xmm6
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm7
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7,8],ymm13[9],ymm15[10,11,12],ymm13[13],ymm15[14,15]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm14 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
@@ -5767,28 +5784,30 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7,8],ymm9[9],ymm0[10,11,12],ymm9[13],ymm0[14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
@@ -5798,12 +5817,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
@@ -5828,12 +5847,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm2
 ; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm4
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm11, %xmm6
+; AVX2-NEXT:    vpshufb %xmm13, %xmm11, %xmm6
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
 ; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm6
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm7
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
@@ -5849,7 +5868,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
@@ -5866,13 +5885,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
+; AVX2-NEXT:    vpshufb %xmm13, %xmm2, %xmm4
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -5888,80 +5907,83 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
 ; AVX2-NEXT:    vmovdqa 16(%r10), %xmm12
-; AVX2-NEXT:    vmovdqa 16(%rax), %xmm10
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15]
+; AVX2-NEXT:    vmovdqa 16(%rax), %xmm11
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 16(%r9), %xmm5
-; AVX2-NEXT:    vmovdqa 16(%r8), %xmm4
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa 16(%r9), %xmm10
+; AVX2-NEXT:    vmovdqa 16(%r8), %xmm6
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13,14,15]
 ; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm7
-; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm6
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm15, %xmm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7,8],ymm0[9],ymm13[10,11,12],ymm0[13],ymm13[14,15]
+; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm4
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7,8],ymm1[9],ymm15[10,11,12],ymm1[13],ymm15[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0],ymm9[1],ymm0[2],ymm9[3],ymm0[4],ymm9[5],ymm0[6],ymm9[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm8, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm8, %ymm8
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm8, %ymm8
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6],ymm1[7],ymm8[8,9,10,11,12],ymm1[13],ymm8[14],ymm1[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5,6,7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13,14,15]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm3, %xmm5
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm6, %ymm6
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4,5,6,7,8],ymm0[9],ymm6[10],ymm0[11],ymm6[12,13,14,15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm5, %ymm1
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -5983,7 +6005,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15]
-; AVX2-NEXT:    vpshufb %xmm14, %xmm6, %xmm7
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15]
@@ -6012,9 +6034,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovdqa %ymm0, 128(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm8, 224(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm9, 192(%rax)
+; AVX2-NEXT:    vmovdqa %ymm13, 288(%rax)
 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 256(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 352(%rax)
@@ -6032,7 +6053,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-NEXT:    addq $328, %rsp # imm = 0x148
+; AVX2-NEXT:    addq $296, %rsp # imm = 0x128
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 7017eb60df41d..49b139fb6a9f1 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -543,11 +543,11 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
-; AVX512-NEXT:    vmovq %rax, %xmm2
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm2
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -648,12 +648,12 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; AVX512-NEXT:    vmovq %rax, %xmm2
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
-; AVX512-NEXT:    vmovq %rax, %xmm3
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX512-NEXT:    vcvtsd2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index b1c8d46f497f3..cef83c78ce7f8 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -353,11 +353,11 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; AVX512-i64:       # %bb.0:
 ; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
 ; AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
 ; AVX512-i64-NEXT:    vmovq %rax, %xmm2
 ; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -465,12 +465,12 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; AVX512-i64-NEXT:    vmovq %rax, %xmm2
 ; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
 ; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX512-i64-NEXT:    vcvtsd2si %xmm3, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
 ; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
 ; AVX512-i64-NEXT:    vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
index 606beeaff750e..fd1a6ad7c4abc 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
@@ -1122,26 +1122,50 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; SSE-NEXT:    addsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v4f64:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v4f64:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1166,24 +1190,62 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; SSE-NEXT:    addsd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v8f64:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v8f64:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v8f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v8f64:
 ; AVX512:       # %bb.0:
@@ -1192,7 +1254,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
@@ -1265,38 +1327,104 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; SSE41-NEXT:    addsd {{[0-9]+}}(%rsp), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v16f64:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm4, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v16f64:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm4, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v16f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm4[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v16f64:
 ; AVX512:       # %bb.0:
@@ -1305,7 +1433,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
@@ -1320,7 +1448,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm1 = zmm2[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
@@ -1410,9 +1538,9 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1421,9 +1549,9 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1487,16 +1615,16 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1507,7 +1635,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
@@ -1619,30 +1747,30 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
 ; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
 ; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
 ; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1653,7 +1781,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
@@ -1668,7 +1796,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
@@ -1720,24 +1848,46 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; SSE-NEXT:    addsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v4f64_undef:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v4f64_undef:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v4f64_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f64_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1761,23 +1911,59 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; SSE-NEXT:    addsd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v8f64_undef:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v8f64_undef:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v8f64_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v8f64_undef:
 ; AVX512:       # %bb.0:
@@ -1785,7 +1971,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
@@ -1829,37 +2015,101 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; SSE-NEXT:    addsd %xmm7, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-SLOW-LABEL: test_v16f64_undef:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: test_v16f64_undef:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: test_v16f64_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX2-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v16f64_undef:
 ; AVX512:       # %bb.0:
@@ -1867,7 +2117,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
@@ -1882,7 +2132,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
index 7048b98227620..caecdfb225739 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -725,37 +725,20 @@ define double @test_v4f64(<4 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-NEXT:    vmaxsd %xmm0, %xmm3, %xmm4
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmaxsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,2,3]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX512-NEXT:    vmaxsd %xmm0, %xmm3, %xmm4
 ; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm4, %xmm4, %k1
-; AVX512-NEXT:    vmaxsd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    vmaxsd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vmaxsd %xmm4, %xmm2, %xmm0
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
@@ -820,45 +803,25 @@ define double @test_v8f64(<8 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512BW-LABEL: test_v8f64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
 ; AVX512BW-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT:    vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm6
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm7 = xmm0[1,0]
 ; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm7, %xmm8
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vmovsd %xmm7, %xmm8, %xmm8 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512BW-NEXT:    vmaxsd %xmm8, %xmm5, %xmm0
-; AVX512BW-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm6, %xmm0
+; AVX512BW-NEXT:    vmaxsd %xmm8, %xmm6, %xmm0
 ; AVX512BW-NEXT:    vmovsd %xmm6, %xmm0, %xmm0 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm3, %xmm0
 ; AVX512BW-NEXT:    vmovsd %xmm3, %xmm0, %xmm0 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
@@ -879,19 +842,19 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512VL-NEXT:    vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm6
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm7 = xmm0[1,0]
 ; AVX512VL-NEXT:    vmaxsd %xmm0, %xmm7, %xmm8
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vmovsd %xmm7, %xmm8, %xmm8 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512VL-NEXT:    vmaxsd %xmm8, %xmm5, %xmm0
-; AVX512VL-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT:    vmaxsd %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT:    vmaxsd %xmm8, %xmm6, %xmm0
 ; AVX512VL-NEXT:    vmovsd %xmm6, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmaxsd %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vmaxsd %xmm0, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vmovsd %xmm3, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
@@ -1011,32 +974,6 @@ define double @test_v16f64(<16 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxpd %ymm0, %ymm2, %ymm4
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
-; AVX-NEXT:    vmaxpd %ymm1, %ymm3, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm1
-; AVX-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
-; AVX-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v16f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmaxpd %zmm0, %zmm1, %zmm2
@@ -1051,7 +988,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 727af12217c67..c0d44118587eb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -729,37 +729,20 @@ define double @test_v4f64(<4 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-NEXT:    vminsd %xmm0, %xmm3, %xmm4
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vminsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vminsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,2,3]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
 ; AVX512-NEXT:    vminsd %xmm0, %xmm3, %xmm4
 ; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovsd %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm4, %xmm4, %k1
-; AVX512-NEXT:    vminsd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    vminsd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vminsd %xmm4, %xmm2, %xmm0
 ; AVX512-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0)
@@ -824,45 +807,25 @@ define double @test_v8f64(<8 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vminpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512BW-LABEL: test_v8f64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
 ; AVX512BW-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT:    vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm6
 ; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm7 = xmm0[1,0]
 ; AVX512BW-NEXT:    vminsd %xmm0, %xmm7, %xmm8
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vmovsd %xmm7, %xmm8, %xmm8 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512BW-NEXT:    vminsd %xmm8, %xmm5, %xmm0
-; AVX512BW-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT:    vminsd %xmm0, %xmm6, %xmm0
+; AVX512BW-NEXT:    vminsd %xmm8, %xmm6, %xmm0
 ; AVX512BW-NEXT:    vmovsd %xmm6, %xmm0, %xmm0 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vminsd %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vminsd %xmm0, %xmm3, %xmm0
 ; AVX512BW-NEXT:    vmovsd %xmm3, %xmm0, %xmm0 {%k1}
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
@@ -883,19 +846,19 @@ define double @test_v8f64(<8 x double> %a0) {
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512VL-NEXT:    vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm6
 ; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm7 = xmm0[1,0]
 ; AVX512VL-NEXT:    vminsd %xmm0, %xmm7, %xmm8
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vmovsd %xmm7, %xmm8, %xmm8 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512VL-NEXT:    vminsd %xmm8, %xmm5, %xmm0
-; AVX512VL-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT:    vminsd %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT:    vminsd %xmm8, %xmm6, %xmm0
 ; AVX512VL-NEXT:    vmovsd %xmm6, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vminsd %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT:    vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vminsd %xmm0, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vmovsd %xmm3, %xmm0, %xmm0 {%k1}
 ; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
@@ -1015,32 +978,6 @@ define double @test_v16f64(<16 x double> %a0) {
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vminpd %ymm0, %ymm2, %ymm4
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
-; AVX-NEXT:    vminpd %ymm1, %ymm3, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm1
-; AVX-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
-; AVX-NEXT:    vminpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v16f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vminpd %zmm0, %zmm1, %zmm2
@@ -1055,7 +992,7 @@ define double @test_v16f64(<16 x double> %a0) {
 ; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
index 17fb2ed5d13a8..a8644e368c382 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
@@ -987,26 +987,14 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; SSE-NEXT:    mulsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v4f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1031,25 +1019,6 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; SSE-NEXT:    mulsd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v8f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
@@ -1057,7 +1026,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
@@ -1130,39 +1099,6 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; SSE41-NEXT:    mulsd {{[0-9]+}}(%rsp), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm5, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm4[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v16f64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
@@ -1170,7 +1106,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
@@ -1185,7 +1121,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm1 = zmm2[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
@@ -1239,24 +1175,13 @@ define double @test_v4f64_one(<4 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64_one:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v4f64_one:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1281,31 +1206,13 @@ define double @test_v8f64_one(<8 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64_one:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v8f64_one:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
@@ -1350,45 +1257,13 @@ define double @test_v16f64_one(<16 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm7, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64_one:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm4
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v16f64_one:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
@@ -1403,7 +1278,7 @@ define double @test_v16f64_one(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
@@ -1455,24 +1330,13 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v4f64_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1496,31 +1360,13 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v8f64_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
@@ -1564,45 +1410,13 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; SSE-NEXT:    mulsd %xmm7, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v16f64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
 ; AVX512-LABEL: test_v16f64_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
@@ -1617,7 +1431,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
index 16493c0448848..ca108ca732bbb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
@@ -50,12 +50,12 @@ define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
 ; vpermps requires a constant load for the index op. It's unlikely to be profitable.
 
 define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
-; ALL-LABEL: unpckh_unary_extracted_v8i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckh_unary_extracted_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
   %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -63,12 +63,12 @@ define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
 }
 
 define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
-; ALL-LABEL: unpckh_unary_extracted_v8f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckh_unary_extracted_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
   %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -158,12 +158,12 @@ define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
 ; vpermps requires a constant load for the index op. It's unlikely to be profitable.
 
 define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
-; ALL-LABEL: unpckl_unary_extracted_v8i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckl_unary_extracted_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
   %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -171,12 +171,12 @@ define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
 }
 
 define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
-; ALL-LABEL: unpckl_unary_extracted_v8f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckl_unary_extracted_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
   %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -226,12 +226,14 @@ define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
 ; This would infinite loop because we did not recognize the unpack shuffle mask in commuted form.
 
 define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
-; ALL-LABEL: extract_unpckl_v8i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; ALL-NEXT:    retq
+; AVX1-LABEL: extract_unpckl_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %shuffle
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dbbfaab9ea26a..aefbad0fb296b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1603,9 +1603,9 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,0,1,0,1,0,1,24,25,26,27,28,29,30,31]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1],xmm0[8,9,10,11,12,13,14,15]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1655,10 +1655,11 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
 ; XOPAVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[6,7,4,5,2,3,0,1],xmm0[14,15,12,13,10,11,8,9]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -5383,8 +5384,10 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
 ; XOPAVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,1],xmm3[2,3],xmm2[2,3],xmm3[12,13],xmm2[12,13],xmm3[14,15],xmm2[14,15]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[0,1,2,3,12,13],xmm2[14,15],xmm1[u,u,u,u,u,u,u,u]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -5454,8 +5457,10 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
 ; XOPAVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1],xmm2[8,9],xmm3[2,3],xmm2[10,11],xmm3[12,13],xmm2[0,1],xmm3[14,15],xmm2[2,3]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3]
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1],xmm2[2,3],xmm1[u,u,u,u,u,u,u,u]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -5671,8 +5676,10 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
 ; XOPAVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1,4,5,2,3,6,7],xmm2[8,9,12,13,10,11,14,15]
+; XOPAVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7]
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11],xmm2[14,15],xmm1[u,u,u,u,u,u,u,u]
 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
 ; XOPAVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -6122,10 +6129,11 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
 ; XOPAVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,10,11,16,17,20,21,24,25,20,21]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,10,11],xmm1[0,1,4,5,8,9,4,5]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -6729,10 +6737,11 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
 ; XOPAVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [14,15,10,11,22,23,24,25,8,9,8,9,26,27,28,29]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4]
+; XOPAVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[14,15,10,11],xmm0[6,7,8,9],xmm1[8,9,8,9],xmm0[10,11,12,13]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -6945,18 +6954,11 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX2-FAST-NEXT:    retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX512VL-FAST-NEXT:    retq
+; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; XOPAVX1:       # %bb.0:
@@ -7680,8 +7682,8 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
 ; AVX2-SLOW-LABEL: PR34369:
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -7690,17 +7692,30 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
 ; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT:    retq
 ;
-; AVX2-FAST-LABEL: PR34369:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    retq
+; AVX2-FAST-ALL-LABEL: PR34369:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,6,0,5]
+; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
+; AVX2-FAST-ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-ALL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: PR34369:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,u,u,u,u,12,13]
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7]
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR34369:
 ; AVX512VL:       # %bb.0:
@@ -7725,9 +7740,9 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
 ;
 ; XOPAVX2-LABEL: PR34369:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[3,2,2,3]
 ; XOPAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; XOPAVX2-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm2[10,11],xmm0[8,9,10,11,12,13],xmm2[4,5]
+; XOPAVX2-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm2[2,3],xmm0[8,9,10,11,12,13],xmm2[12,13]
 ; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d8ee8103cee50..4afbda020b528 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2541,11 +2541,11 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0],xmm3[0,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0,0,0,0,0,0,0,0],xmm0[8,9,10,11,12,13,14,15]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -2596,11 +2596,11 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24]
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm2[7,6,5,4,3,2,1,0],xmm3[0,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[7,6,5,4,3,2,1,0],xmm0[15,14,13,12,11,10,9,8]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -4687,11 +4687,23 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22]
+; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; XOPAVX1:       # %bb.0:
@@ -5145,12 +5157,14 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
 ;
 ; XOPAVX1-LABEL: PR28136:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14],xmm0[9],xmm1[9],xmm0[11],xmm1[11],xmm0[13],xmm1[13],xmm0[15],xmm1[15]
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[1],xmm1[1],xmm0[3],xmm1[3],xmm0[5],xmm1[5],xmm0[7],xmm1[7]
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,8,u,10,u,12,u,14,u,9,u,11,u,13,u,15]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[8],xmm2[1],xmm0[10],xmm2[3],xmm0[12],xmm2[5],xmm0[14],xmm2[7],xmm0[9],xmm2[9],xmm0[11],xmm2[11],xmm0[13],xmm2[13],xmm0[15],xmm2[15]
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,0,u,2,u,4,u,6,u,1,u,3,u,5,u,7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7],xmm0[1],xmm1[9],xmm0[3],xmm1[11],xmm0[5],xmm1[13],xmm0[7],xmm1[15]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index fb8618be17f06..ec49be2cc761b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2348,21 +2348,57 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
 }
 
 define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
-; ALL-LABEL: unpckh_v4i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: unpckh_v4i64:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4i64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v4i64:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,7]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v4i64:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 poison, i32 poison>
   ret <4 x i64> %unpckh
 }
 
 define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
-; ALL-LABEL: unpckh_v4f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: unpckh_v4f64:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4f64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v4f64:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,7]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v4f64:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 poison, i32 poison>
   ret <4 x double> %unpckh
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index bd78dbded0705..ecb618f126a2a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1592,11 +1592,47 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_5555uuuu:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT:    retq
+; AVX1-LABEL: shuffle_v8f32_5555uuuu:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX512VL-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x float> %shuffle
 }
@@ -3389,11 +3425,47 @@ define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_5555uuuu:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i32_5555uuuu:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX512VL-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x i32> %shuffle
 }
@@ -3859,32 +3931,41 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: broadcast_concat_crash:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX2-SLOW-LABEL: broadcast_concat_crash:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    retq
 ;
-; AVX512VL-SLOW-LABEL: broadcast_concat_crash:
-; AVX512VL-SLOW:       # %bb.0: # %entry
-; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT:    retq
+; AVX2-FAST-ALL-LABEL: broadcast_concat_crash:
+; AVX2-FAST-ALL:       # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
 ;
-; AVX512VL-FAST-LABEL: broadcast_concat_crash:
-; AVX512VL-FAST:       # %bb.0: # %entry
-; AVX512VL-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,3,3]
-; AVX512VL-FAST-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1
-; AVX512VL-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX2-FAST-PERLANE-LABEL: broadcast_concat_crash:
+; AVX2-FAST-PERLANE:       # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-FAST-PERLANE-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512VL-LABEL: broadcast_concat_crash:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [7,8,7,7]
+; AVX512VL-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
 entry:
   %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %bc = bitcast <8 x float> %tmp to <4 x i64>
@@ -3899,11 +3980,29 @@ entry:
 ; PR40434: https://bugs.llvm.org/show_bug.cgi?id=40434
 
 define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; ALL-LABEL: unpckh_v8i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: unpckh_v8i32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8i32:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8i32:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8i32:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x i32> %unpckh
 }
@@ -3911,11 +4010,29 @@ define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; Same as above but with floats.
 
 define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
-; ALL-LABEL: unpckh_v8f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: unpckh_v8f32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8f32:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8f32:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8f32:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x float> %unpckh
 }
@@ -3923,11 +4040,29 @@ define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
 ; Alternate form of the above - make sure we don't have conflicting transforms.
 
 define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; ALL-LABEL: blend_perm_v8i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: blend_perm_v8i32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: blend_perm_v8i32:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: blend_perm_v8i32:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: blend_perm_v8i32:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
   %r = shufflevector <8 x i32> %unpckh, <8 x i32> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x i32> %r
@@ -3936,11 +4071,29 @@ define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; Same as above but with floats.
 
 define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) {
-; ALL-LABEL: blend_perm_v8f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: blend_perm_v8f32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: blend_perm_v8f32:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: blend_perm_v8f32:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: blend_perm_v8f32:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
   %r = shufflevector <8 x float> %unpckh, <8 x float> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x float> %r
@@ -3949,11 +4102,47 @@ define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) {
 ; Another variation of the above - make sure we don't have conflicting transforms.
 
 define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) {
-; ALL-LABEL: unpckh_v8i32_unary:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckh_v8i32_unary:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: unpckh_v8i32_unary:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: unpckh_v8i32_unary:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [2,6,3,7]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: unpckh_v8i32_unary:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8i32_unary:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8i32_unary:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,3,7]
+; AVX512VL-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8i32_unary:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %r = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x i32> %r
 }
@@ -3961,11 +4150,47 @@ define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) {
 ; Same as above but with floats.
 
 define <8 x float> @unpckh_v8f32_unary(<8 x float> %x) {
-; ALL-LABEL: unpckh_v8f32_unary:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT:    retq
+; AVX1-LABEL: unpckh_v8f32_unary:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: unpckh_v8f32_unary:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: unpckh_v8f32_unary:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [2,6,3,7]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: unpckh_v8f32_unary:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8f32_unary:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-SLOW-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8f32_unary:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,3,7]
+; AVX512VL-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8f32_unary:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %r = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x float> %r
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index b1efb416014b0..114a6ce5aeeae 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -412,40 +412,25 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 
 ;FIXME: can do better with vpcompress
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; SLOW-NEXT:    retq
-;
-; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; FAST:       # %bb.0:
-; FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
-; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; FAST-NEXT:    retq
+; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; ALL-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   ret <8 x i32> %res
 }
 
 ;FIXME: can do better with vpcompress
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; SLOW-LABEL: test_v16i32_0_1_2_12:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; SLOW-NEXT:    vbroadcastss %xmm1, %xmm1
-; SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; SLOW-NEXT:    vzeroupper
-; SLOW-NEXT:    retq
-;
-; FAST-LABEL: test_v16i32_0_1_2_12:
-; FAST:       # %bb.0:
-; FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
-; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; FAST-NEXT:    vzeroupper
-; FAST-NEXT:    retq
+; ALL-LABEL: test_v16i32_0_1_2_12:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
+; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
   ret <4 x i32> %res
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135ddcec95..bd68817aa4a46 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -466,8 +466,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
 ; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -484,8 +483,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
 ; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
@@ -509,8 +507,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
 ; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
 ; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -533,8 +530,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
 ; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -1048,10 +1044,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1065,10 +1061,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1082,10 +1078,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
 ;
@@ -1101,11 +1097,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
 define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124(<64 x i8> %a0, <64 x i8> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm3[29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm2[29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm1[29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1118,11 +1113,10 @@ define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm3[29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm2[29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm1[29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
@@ -1140,11 +1134,10 @@ define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_
 define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125(<64 x i8> %a0, <64 x i8> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm3[30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm2[30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1157,11 +1150,10 @@ define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm3[30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm2[30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
@@ -1178,11 +1170,10 @@ define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_
 define <64 x i8> @shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126(<64 x i8> %a0, <64 x i8> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1195,11 +1186,10 @@ define <64 x i8> @shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
@@ -1631,15 +1621,14 @@ define <64 x i8> @shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_0
 define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ; AVX512F-LABEL: PR54562_ref:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm2 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: PR54562_ref:
@@ -1651,15 +1640,14 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ;
 ; AVX512DQ-LABEL: PR54562_ref:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm2 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: PR54562_ref:
@@ -1675,9 +1663,9 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 define void @PR54562_mem(ptr %src, ptr %dst) {
 ; AVX512F-LABEL: PR54562_mem:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512F-NEXT:    vinserti128 $1, 32(%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
@@ -1698,9 +1686,9 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
 ;
 ; AVX512DQ-LABEL: PR54562_mem:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT:    valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512DQ-NEXT:    vinserti128 $1, 32(%rdi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index fce98cd470bcd..dd33a5b7067c9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1836,8 +1836,7 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
 define <2 x double> @test_v8f64_34 (<8 x double> %v) {
 ; ALL-LABEL: test_v8f64_34:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [3,4]
-; ALL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 545a9d3e314a2..a22fa94a0e879 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -547,53 +547,33 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) {
 }
 
 define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
-; X86-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X86-AVX512-SLOW:       # %bb.0:
-; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-SLOW-NEXT:    vpbroadcastd 44(%ecx), %xmm0
-; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%eax)
-; X86-AVX512-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%eax)
-; X86-AVX512-SLOW-NEXT:    vzeroupper
-; X86-AVX512-SLOW-NEXT:    retl
-;
-; X64-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X64-AVX512-SLOW:       # %bb.0:
-; X64-AVX512-SLOW-NEXT:    vpbroadcastd 44(%rdi), %xmm0
-; X64-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X64-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%rsi)
-; X64-AVX512-SLOW-NEXT:    vzeroupper
-; X64-AVX512-SLOW-NEXT:    retq
-;
-; X86-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X86-AVX512-FAST:       # %bb.0:
-; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-FAST-NEXT:    vpbroadcastd 44(%ecx), %xmm0
-; X86-AVX512-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%eax)
-; X86-AVX512-FAST-NEXT:    vmovdqa 208(%ecx), %xmm0
-; X86-AVX512-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%eax)
-; X86-AVX512-FAST-NEXT:    vzeroupper
-; X86-AVX512-FAST-NEXT:    retl
-;
-; X64-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X64-AVX512-FAST:       # %bb.0:
-; X64-AVX512-FAST-NEXT:    vpbroadcastd 44(%rdi), %xmm0
-; X64-AVX512-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512-FAST-NEXT:    vmovdqa 208(%rdi), %xmm0
-; X64-AVX512-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%rsi)
-; X64-AVX512-FAST-NEXT:    vzeroupper
-; X64-AVX512-FAST-NEXT:    retq
+; X86-AVX512-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT:    vpbroadcastd 44(%ecx), %xmm0
+; X86-AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vmovdqa %ymm0, 672(%eax)
+; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [13,12,2,3]
+; X86-AVX512-NEXT:    vpermi2d 192(%ecx), %ymm1, %ymm0
+; X86-AVX512-NEXT:    vmovdqa %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmovdqa %ymm0, 832(%eax)
+; X86-AVX512-NEXT:    vzeroupper
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpbroadcastd 44(%rdi), %xmm0
+; X64-AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vmovdqa %ymm0, 672(%rsi)
+; X64-AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [13,12,2,3]
+; X64-AVX512-NEXT:    vpermi2d 192(%rdi), %ymm1, %ymm0
+; X64-AVX512-NEXT:    vmovdqa %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa %ymm0, 832(%rsi)
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
 ; X86-AVX512F:       # %bb.0:
@@ -602,8 +582,11 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
 ; X86-AVX512F-NEXT:    vpbroadcastd 44(%ecx), %xmm0
 ; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X86-AVX512F-NEXT:    vmovdqa %ymm0, 672(%eax)
-; X86-AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [21,20,2,3]
+; X86-AVX512F-NEXT:    vmovdqa 192(%ecx), %ymm1
+; X86-AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
+; X86-AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
 ; X86-AVX512F-NEXT:    vmovdqa %ymm0, 832(%eax)
 ; X86-AVX512F-NEXT:    vzeroupper
 ; X86-AVX512F-NEXT:    retl
@@ -613,8 +596,11 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
 ; X64-AVX512F-NEXT:    vpbroadcastd 44(%rdi), %xmm0
 ; X64-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-AVX512F-NEXT:    vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X64-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [21,20,2,3]
+; X64-AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm1
+; X64-AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64-AVX512F-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
+; X64-AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
 ; X64-AVX512F-NEXT:    vmovdqa %ymm0, 832(%rsi)
 ; X64-AVX512F-NEXT:    vzeroupper
 ; X64-AVX512F-NEXT:    retq
@@ -689,3 +675,8 @@ entry:
   store <16 x float> %1, ptr %a0, align 64
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-AVX512-FAST: {{.*}}
+; X64-AVX512-SLOW: {{.*}}
+; X86-AVX512-FAST: {{.*}}
+; X86-AVX512-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 2df013d0ff3e3..c80c85c03093e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -140,10 +140,10 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
 ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0]
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
-; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [8,9,0,1]
+; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
   %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
@@ -502,9 +502,9 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%ecx)
-; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; X86-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
@@ -518,21 +518,23 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
-; X86-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
-; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm4
-; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
-; X86-AVX512-NEXT:    vmovapd %ymm4, (%edx)
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
-; X86-AVX512-NEXT:    vpermi2pd %zmm0, %zmm3, %zmm4
-; X86-AVX512-NEXT:    vmovapd %ymm4, (%ecx)
+; X86-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm3[3]
+; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [1,2,8,9]
+; X86-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm5
+; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X86-AVX512-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3]
+; X86-AVX512-NEXT:    vpermpd {{.*#+}} ymm4 = ymm5[0,3,2,1]
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX512-NEXT:    vmovapd %ymm3, (%ecx)
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,11,0,0]
-; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
-; X86-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
-; X86-AVX512-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX512-NEXT:    vpermt2pd %zmm1, %zmm3, %zmm0
+; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,8,9,3]
+; X86-AVX512-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm1
+; X86-AVX512-NEXT:    vmovapd %ymm1, (%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
 ;
@@ -572,9 +574,9 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rsi)
-; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; X64-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm0, (%rdx)
 ; X64-AVX2-NEXT:    vzeroupper
@@ -585,21 +587,23 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
-; X64-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm3
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
-; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm4
-; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
-; X64-AVX512-NEXT:    vmovapd %ymm4, (%rdi)
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
-; X64-AVX512-NEXT:    vpermi2pd %zmm0, %zmm3, %zmm4
-; X64-AVX512-NEXT:    vmovapd %ymm4, (%rsi)
+; X64-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm3[3]
+; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm5 = [1,2,8,9]
+; X64-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm5
+; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X64-AVX512-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3]
+; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm4 = ymm5[0,3,2,1]
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX512-NEXT:    vmovapd %ymm3, (%rsi)
 ; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,11]
-; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
-; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
-; X64-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
-; X64-AVX512-NEXT:    vmovapd %ymm0, (%rdx)
+; X64-AVX512-NEXT:    vpermt2pd %zmm1, %zmm3, %zmm0
+; X64-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,8,9,3]
+; X64-AVX512-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm1
+; X64-AVX512-NEXT:    vmovapd %ymm1, (%rdx)
 ; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
   %t0 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 68040b58858a7..d7807b0c821ab 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1566,23 +1566,14 @@ define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
 ; SSE-NEXT:    movaps %xmm2, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: combine_test21:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: combine_test21:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
-; AVX2-NEXT:    vmovaps %xmm1, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: combine_test21:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    vmovaps %xmm2, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   store <4 x i32> %1, ptr %ptr, align 16
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index 497f71aea2227..0b00af60b56f7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -375,46 +375,46 @@ define <64 x i8> @f3(ptr %p0) {
 ;
 ; AVX512BW-LABEL: f3:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm0
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
 ; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm2
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512BW-NEXT:    movl $-2097152, %eax # imm = 0xFFE00000
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm4
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512BW-NEXT:    vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
-; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm3 {%k1}
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm2 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 732cc445ddcd8..656892c979320 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -75,27 +75,13 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
-; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-ALL:       # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-PERLANE:       # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT:    retq
+; AVX2-LABEL: trunc8i64_8i32_ashr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_ashr:
 ; AVX512:       # %bb.0: # %entry
@@ -123,27 +109,13 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
-; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-ALL:       # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-PERLANE:       # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT:    retq
+; AVX2-LABEL: trunc8i64_8i32_lshr:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_lshr:
 ; AVX512:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index 5b14e2782ee1c..1a5af351b7fc9 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -422,8 +422,7 @@ define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2,
 ; AVX512BWNOVL-LABEL: vselect_packss:
 ; AVX512BWNOVL:       # %bb.0:
 ; AVX512BWNOVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512BWNOVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWNOVL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWNOVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWNOVL-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
 ; AVX512BWNOVL-NEXT:    vzeroupper
 ; AVX512BWNOVL-NEXT:    retq
@@ -431,8 +430,7 @@ define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2,
 ; AVX512BWVL-LABEL: vselect_packss:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index c3700189d3d0e..6b98c108cbc68 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -217,11 +217,11 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512F-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
 ; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -252,16 +252,17 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
 ; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512VL-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index fd251a99ca500..8b1f27a62cb66 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -155,34 +155,34 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm6, %xmm5, %xmm5
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT:    vdivps %xmm8, %xmm7, %xmm7
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512F-NEXT:    vdivps %xmm9, %xmm8, %xmm8
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512F-NEXT:    vdivps %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512F-NEXT:    vdivps %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm12 = mem[0],zero
+; AVX512F-NEXT:    vdivps %xmm12, %xmm11, %xmm11
+; AVX512F-NEXT:    vinsertf32x4 $1, %xmm11, %zmm10, %zmm10
+; AVX512F-NEXT:    vinsertf32x4 $1, %xmm9, %zmm8, %zmm8
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,2,8,10,0,2,8,10]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2pd %zmm10, %zmm9, %zmm8
+; AVX512F-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vdivps %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm3
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vdivps %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm9, %zmm0
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index 6c3e0ff5a9bcd..56e81fed5af8a 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -217,11 +217,11 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512F-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
 ; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -252,16 +252,17 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
 ; AVX512VL-NEXT:    vmulps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512VL-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 7405d9b7b1c65..3c5a99a342f47 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -217,11 +217,11 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512F-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
 ; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -252,16 +252,17 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
 ; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
 ; AVX512VL-NEXT:    vsubps %xmm8, %xmm7, %xmm7
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512VL-NEXT:    vmovupd %zmm0, (%rdx)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index a01e6ca4b175d..43fc3ca4e1d47 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1275,116 +1275,122 @@ ret void
 define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm11
+; AVX1-NEXT:    subq $40, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm8
 ; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm13
-; AVX1-NEXT:    vmovups 64(%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 64(%rdi), %xmm2
+; AVX1-NEXT:    vmovups 80(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
 ; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm5
-; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm2
-; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm10
-; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm3
-; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT:    vpshufb %xmm9, %xmm5, %xmm6
-; AVX1-NEXT:    vpshufb %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
-; AVX1-NEXT:    vpshufb %xmm9, %xmm13, %xmm9
+; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm3
+; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm7
+; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm4
+; AVX1-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
+; AVX1-NEXT:    vpshufb %xmm11, %xmm5, %xmm6
+; AVX1-NEXT:    vpshufb %xmm11, %xmm7, %xmm9
+; AVX1-NEXT:    vpshufb %xmm11, %xmm8, %xmm10
+; AVX1-NEXT:    vpshufb %xmm11, %xmm13, %xmm11
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm5, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
-; AVX1-NEXT:    vpshufb %xmm15, %xmm2, %xmm12
-; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm15, %xmm3, %xmm12
+; AVX1-NEXT:    vmovdqa %xmm3, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpor %xmm5, %xmm12, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
-; AVX1-NEXT:    vpshufb %xmm15, %xmm3, %xmm12
-; AVX1-NEXT:    vpor %xmm10, %xmm12, %xmm0
+; AVX1-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
+; AVX1-NEXT:    vpshufb %xmm15, %xmm4, %xmm12
+; AVX1-NEXT:    vpor %xmm7, %xmm12, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
 ; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm15, %xmm1, %xmm12
-; AVX1-NEXT:    vpor %xmm11, %xmm12, %xmm1
-; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm14, %xmm13, %xmm11
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm15, %xmm1, %xmm13
-; AVX1-NEXT:    vpor %xmm11, %xmm13, %xmm11
-; AVX1-NEXT:    vpshufb %xmm14, %xmm1, %xmm13
-; AVX1-NEXT:    vpshufb %xmm15, %xmm4, %xmm5
-; AVX1-NEXT:    vpor %xmm5, %xmm13, %xmm5
-; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX1-NEXT:    vpshufb %xmm14, %xmm0, %xmm13
-; AVX1-NEXT:    vpshufb %xmm15, %xmm1, %xmm10
-; AVX1-NEXT:    vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT:    vmovdqu 176(%rdi), %xmm13
-; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
-; AVX1-NEXT:    vpshufb %xmm15, %xmm13, %xmm12
-; AVX1-NEXT:    vpor %xmm0, %xmm12, %xmm3
-; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm12
+; AVX1-NEXT:    vpor %xmm8, %xmm12, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm14, %xmm13, %xmm8
+; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm15, %xmm2, %xmm13
+; AVX1-NEXT:    vpor %xmm8, %xmm13, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm13
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm15, %xmm2, %xmm5
+; AVX1-NEXT:    vpor %xmm5, %xmm13, %xmm8
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm13
+; AVX1-NEXT:    vpshufb %xmm14, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb %xmm15, %xmm13, %xmm7
+; AVX1-NEXT:    vpor %xmm0, %xmm7, %xmm5
+; AVX1-NEXT:    vmovdqu 176(%rdi), %xmm7
+; AVX1-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
+; AVX1-NEXT:    vpshufb %xmm15, %xmm7, %xmm12
+; AVX1-NEXT:    vpor %xmm0, %xmm12, %xmm1
+; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm12
 ; AVX1-NEXT:    vmovdqu 128(%rdi), %xmm14
 ; AVX1-NEXT:    vpshufb %xmm15, %xmm14, %xmm15
 ; AVX1-NEXT:    vpor %xmm12, %xmm15, %xmm15
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm12
 ; AVX1-NEXT:    vpor %xmm6, %xmm12, %xmm12
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm15
-; AVX1-NEXT:    vpor %xmm7, %xmm15, %xmm15
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpor %xmm2, %xmm8, %xmm2
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm15
+; AVX1-NEXT:    vpor %xmm15, %xmm9, %xmm15
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm10, %xmm3
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm10[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm10, %xmm0, %xmm8
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX1-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
-; AVX1-NEXT:    vpor %xmm4, %xmm10, %xmm4
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
-; AVX1-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm10, %xmm1
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
+; AVX1-NEXT:    vpshufb %xmm11, %xmm2, %xmm4
+; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215]
+; AVX1-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpshufb %xmm11, %xmm13, %xmm13
-; AVX1-NEXT:    vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
-; AVX1-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
-; AVX1-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm8, %xmm1
-; AVX1-NEXT:    vpshufb %xmm11, %xmm15, %xmm2
+; AVX1-NEXT:    vpor %xmm0, %xmm13, %xmm0
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
+; AVX1-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm10, %xmm7
+; AVX1-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT:    vpshufb %xmm11, %xmm14, %xmm10
+; AVX1-NEXT:    vpor %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm11, %xmm2, %xmm13
+; AVX1-NEXT:    vpor %xmm1, %xmm13, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm11, %xmm2, %xmm4
+; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpshufb %xmm10, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
 ; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm10, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vpshufb %xmm11, %xmm12, %xmm3
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm9, %xmm2
+; AVX1-NEXT:    vpshufb %xmm10, %xmm12, %xmm3
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX1-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm9, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpaddb %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT:    addq $40, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
@@ -1889,11 +1895,19 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
 }
 
 define <2 x i64> @PR37616(ptr %a0) nounwind {
-; AVX-LABEL: PR37616:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT:    retq
+; AVX1OR2-LABEL: PR37616:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovaps 16(%rdi), %xmm0
+; AVX1OR2-NEXT:    vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: PR37616:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [2,6]
+; AVX512-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %load = load <16 x i64>, ptr %a0, align 128
   %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
   ret <2 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index eb463837c3bb8..10f2e99014d6e 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -5409,20 +5409,36 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    movb $5, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec384_v6i64_to_v3i128_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT:    movb $5, %al
+; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-SLOW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
+; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec384_v6i64_to_v3i128_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [2,5]
+; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT:    vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512BW-FAST-NEXT:    movb $5, %al
+; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-FAST-NEXT:    vpexpandq %ymm1, %ymm1 {%k1} {z}
+; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -7188,23 +7204,23 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 35f25d36cb2e9..e321b7e8054ea 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -747,31 +747,23 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX512BW-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -865,31 +857,19 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -941,73 +921,91 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
+; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
-; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
+; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512F-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512DQ-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-SLOW-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2011,28 +2009,16 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2163,28 +2149,16 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2315,28 +2289,16 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
+; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2434,57 +2396,109 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-SLOW-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512F-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-FAST-NEXT:    vpbroadcastb %xmm0, %ymm2
+; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512DQ-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-SLOW-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQ-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vpbroadcastb %xmm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512DQ-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512BW-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT:    vpbroadcastb %xmm0, %ymm2
+; AVX512BW-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512BW-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -3561,86 +3575,160 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX-NEXT:    retq
+; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX2-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw %xmm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vzeroupper
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX2-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX2-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpbroadcastw %xmm0, %ymm2
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT:    vpbroadcastw %xmm1, %ymm2
+; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero
+; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpbroadcastw %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vpbroadcastw %xmm1, %ymm2
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX512DQ-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero
+; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3839,14 +3927,16 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -4065,14 +4155,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
+; AVX512BW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -4270,14 +4363,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -4418,20 +4513,38 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31]
+; AVX512BW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
+; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4557,18 +4670,32 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
+; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4742,13 +4869,12 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,0,15,0,21,0,23]
+; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -4759,13 +4885,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -4943,13 +5066,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,0,20,21,0,23]
+; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -4960,13 +5082,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0]
+; AVX512BW-FAST-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
@@ -5229,10 +5348,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,20,21,0,23]
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -5355,32 +5474,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
-; AVX512BW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5495,31 +5599,18 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0]
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index a598e30845579..f17a2e9979207 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vpermt2w (%rdi), %ymm0, %ymm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
   %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
   %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
@@ -3199,16 +3183,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512F-SLOW:       # %bb.0:
-; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512F-SLOW-NEXT:    vpbroadcastw (%rdi), %ymm1
-; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512F-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX512F-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
@@ -3229,16 +3214,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vpbroadcastw (%rdi), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
@@ -3501,35 +3487,69 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rdx)
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512F-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rdx)
-; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512F-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rdx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rdx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, (%rdx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
 ; AVX512BW:       # %bb.0:
@@ -4326,7 +4346,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX-NEXT:    vmovaps 32(%rsi), %ymm2
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)



More information about the llvm-commits mailing list